From be067861c6ab4df53a45e332caf53c163330290d Mon Sep 17 00:00:00 2001
From: Chauncey <chaunceyjiang@gmail.com>
Date: Sat, 11 Oct 2025 10:43:39 +0800
Subject: [PATCH 01/30]  [Frontend] Improve the performance of
 `is_reasoning_end` (#25735)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
---
 vllm/reasoning/basic_parsers.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/reasoning/basic_parsers.py b/vllm/reasoning/basic_parsers.py
index b4106a4f57945..f47ffe6212caf 100644
--- a/vllm/reasoning/basic_parsers.py
+++ b/vllm/reasoning/basic_parsers.py
@@ -59,7 +59,8 @@ class BaseThinkingReasoningParser(ReasoningParser):
             )
 
     def is_reasoning_end(self, input_ids: list[int]) -> bool:
-        return self.end_token_id in input_ids
+        end_token_id = self.end_token_id
+        return any(input_id == end_token_id for input_id in reversed(input_ids))
 
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
         """

From 8f8474fbe357c587fee838cf76e59c465593a2fd Mon Sep 17 00:00:00 2001
From: Nishidha Panpaliya <nishidha.panpaliya@partner.ibm.com>
Date: Sat, 11 Oct 2025 10:34:42 +0530
Subject: [PATCH 02/30] [CI/Build] Fix ppc64le CPU build and tests (#22443)

Signed-off-by: Nishidha Panpaliya <nishidha.panpaliya@partner.ibm.com>
---
 .../hardware_ci/run-cpu-test-ppc64le.sh       | 15 +++--
 cmake/cpu_extension.cmake                     |  2 +-
 docker/Dockerfile.ppc64le                     | 64 +++++++++++++------
 3 files changed, 55 insertions(+), 26 deletions(-)

diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
index 36bcb015d308e..39ea180173081 100755
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -25,25 +25,28 @@ function cpu_tests() {
 
   # offline inference
   podman exec -it "$container_id" bash -c "
-    set -e
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+    set -xve
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
 
   # Run basic model test
   podman exec -it "$container_id" bash -c "
-    set -e
+    set -evx
     pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
     pip install sentence-transformers datamodel_code_generator
-    pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
+
+    # Note: Bart is disabled until it supports V1
+    # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
     pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
-    pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
+    # TODO: The test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le, so the test_embedding.py run below is disabled for the time being.
+    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
 }
 
 # All of CPU tests are expected to be finished less than 40 mins.
 
 export container_id
 export -f cpu_tests
-timeout 40m bash -c cpu_tests
+timeout 120m bash -c cpu_tests
 
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index c962564c8da08..a6e53588f4f0f 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -309,4 +309,4 @@ define_gpu_extension_target(
     WITH_SOABI
 )
 
-message(STATUS "Enabling C extension.")
+message(STATUS "Enabling C extension.")
\ No newline at end of file
diff --git a/docker/Dockerfile.ppc64le b/docker/Dockerfile.ppc64le
index 5eaef4ea980de..ad9eae94b83dd 100644
--- a/docker/Dockerfile.ppc64le
+++ b/docker/Dockerfile.ppc64le
@@ -1,4 +1,4 @@
-ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
+ARG BASE_UBI_IMAGE_TAG=9.6-1754584681
 
 ###############################################################
 # Stage to build openblas
@@ -7,7 +7,7 @@ ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
 FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS openblas-builder
 
 ARG MAX_JOBS
-ARG OPENBLAS_VERSION=0.3.29
+ARG OPENBLAS_VERSION=0.3.30
 RUN microdnf install -y dnf && dnf install -y gcc-toolset-13 make wget unzip \
     && source /opt/rh/gcc-toolset-13/enable \
     && wget https://github.com/OpenMathLib/OpenBLAS/releases/download/v$OPENBLAS_VERSION/OpenBLAS-$OPENBLAS_VERSION.zip \
@@ -38,7 +38,7 @@ RUN dnf install -y openjpeg2-devel lcms2-devel tcl-devel tk-devel fribidi-devel
 FROM centos-deps-builder AS base-builder
 
 ARG PYTHON_VERSION=3.12
-ARG OPENBLAS_VERSION=0.3.29
+ARG OPENBLAS_VERSION=0.3.30
 
 # Set Environment Variables for venv, cargo & openblas
 ENV VIRTUAL_ENV=/opt/vllm
@@ -61,7 +61,7 @@ RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,
        pkgconfig xsimd zeromq-devel kmod findutils protobuf* \
        libtiff-devel libjpeg-devel zlib-devel freetype-devel libwebp-devel \
        harfbuzz-devel libraqm-devel libimagequant-devel libxcb-devel \
-       python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
+       python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip clang-devel \
     && dnf clean all \
     && PREFIX=/usr/local make -C /openblas install \
     && ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \
@@ -79,9 +79,9 @@ RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,
 FROM base-builder AS torch-builder
 
 ARG MAX_JOBS
-ARG TORCH_VERSION=2.6.0
+ARG TORCH_VERSION=2.7.0
 ARG _GLIBCXX_USE_CXX11_ABI=1
-ARG OPENBLAS_VERSION=0.3.29
+ARG OPENBLAS_VERSION=0.3.30
 
 RUN --mount=type=cache,target=/root/.cache/uv \
     source /opt/rh/gcc-toolset-13/enable &&  \
@@ -93,7 +93,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     MAX_JOBS=${MAX_JOBS:-$(nproc)} \
     PYTORCH_BUILD_VERSION=${TORCH_VERSION} PYTORCH_BUILD_NUMBER=1 uv build --wheel --out-dir /torchwheels/
 
-ARG TORCHVISION_VERSION=0.21.0
+ARG TORCHVISION_VERSION=0.22.0
 ARG TORCHVISION_USE_NVJPEG=0
 ARG TORCHVISION_USE_FFMPEG=0
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -104,7 +104,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     BUILD_VERSION=${TORCHVISION_VERSION} \
     uv build --wheel --out-dir /torchwheels/ --no-build-isolation
 
-ARG TORCHAUDIO_VERSION=2.6.0
+ARG TORCHAUDIO_VERSION=2.7.0
 ARG BUILD_SOX=1
 ARG BUILD_KALDI=1
 ARG BUILD_RNNT=1
@@ -128,7 +128,7 @@ FROM base-builder AS arrow-builder
 
 ARG MAX_JOBS
 ARG PYARROW_PARALLEL
-ARG PYARROW_VERSION=19.0.1
+ARG PYARROW_VERSION=21.0.0
 RUN --mount=type=cache,target=/root/.cache/uv \
     source /opt/rh/gcc-toolset-13/enable && \
     git clone --recursive https://github.com/apache/arrow.git -b apache-arrow-${PYARROW_VERSION} && \
@@ -145,7 +145,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     make install -j ${MAX_JOBS:-$(nproc)} && \
     cd ../../python/ && \
     uv pip install -v -r requirements-build.txt && uv pip install numpy==2.1.3 && \
-    pip show numpy && ls -lrt /opt/vllm/lib/python3.12/site-packages/numpy && \
     PYARROW_PARALLEL=${PYARROW_PARALLEL:-$(nproc)} \
     python setup.py build_ext \
     --build-type=release --bundle-arrow-cpp \
@@ -187,6 +186,23 @@ RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_V
     && make -j ${MAX_JOBS:-$(nproc)}
 
 
+###############################################################
+# Stage to build numba
+###############################################################
+
+FROM base-builder AS numba-builder
+
+ARG MAX_JOBS
+ARG NUMBA_VERSION=0.61.2
+
+# Install LLVM 15, clone numba, patch _dispatcher.cpp if needed, and build the wheel
+RUN dnf install ninja-build llvm15 llvm15-devel -y && source /opt/rh/gcc-toolset-13/enable && export PATH=$PATH:/usr/lib64/llvm15/bin && \
+    git clone --recursive https://github.com/numba/numba.git -b ${NUMBA_VERSION} && \
+    cd ./numba && \
+    if ! grep '#include "dynamic_annotations.h"' numba/_dispatcher.cpp; then \
+       sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \
+    fi && python -m build --wheel --installer=uv --outdir /numbawheels/
+
 ###############################################################
 # Stage to build vllm - this stage builds and installs
 # vllm, tensorizer and vllm-tgis-adapter and builds uv cache
@@ -199,6 +215,7 @@ COPY --from=torch-builder /tmp/control /dev/null
 COPY --from=arrow-builder /tmp/control /dev/null
 COPY --from=cv-builder /tmp/control /dev/null
 COPY --from=numa-builder /tmp/control /dev/null
+COPY --from=numba-builder /tmp/control /dev/null
 
 ARG VLLM_TARGET_DEVICE=cpu
 ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
@@ -206,6 +223,8 @@ ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
 # this step installs vllm and populates uv cache
 # with all the transitive dependencies
 RUN --mount=type=cache,target=/root/.cache/uv \
+    dnf install llvm15 llvm15-devel -y && \
+    rpm -ivh --nodeps https://mirror.stream.centos.org/9-stream/CRB/ppc64le/os/Packages/protobuf-lite-devel-3.14.0-16.el9.ppc64le.rpm && \
     source /opt/rh/gcc-toolset-13/enable && \
     git clone https://github.com/huggingface/xet-core.git && cd xet-core/hf_xet/ && \
     uv pip install maturin && \
@@ -215,15 +234,18 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
     --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
     --mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \
+    --mount=type=bind,from=numba-builder,source=/numbawheels/,target=/numbawheels/,ro \
     --mount=type=bind,src=.,dst=/src/,rw \
     source /opt/rh/gcc-toolset-13/enable && \
-    uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \
+    export PATH=$PATH:/usr/lib64/llvm15/bin && \
+    uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /numbawheels/*.whl && \
     sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \
-    uv pip install pandas pythran pybind11 /hf_wheels/*.whl && \
+    sed -i -e 's/.*sentencepiece.*//g' /src/pyproject.toml /src/requirements/*.txt && \
+    uv pip install sentencepiece==0.2.0 pandas pythran nanobind pybind11 /hf_wheels/*.whl && \
     make -C /numactl install && \
     # sentencepiece.pc is in some pkgconfig inside uv cache
     export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \
-    uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
+    nanobind_DIR=$(uv pip show nanobind | grep Location | sed 's/^Location: //;s/$/\/nanobind\/cmake/') && uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
     cd /src/ && \
     uv build --wheel --out-dir /vllmwheel/ --no-build-isolation && \
     uv pip install /vllmwheel/*.whl
@@ -250,7 +272,7 @@ RUN git clone --recursive https://github.com/Reference-LAPACK/lapack.git -b v${L
 FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS vllm-openai
 
 ARG PYTHON_VERSION=3.12
-ARG OPENBLAS_VERSION=0.3.29
+ARG OPENBLAS_VERSION=0.3.30
 
 # Set Environment Variables for venv & openblas
 ENV VIRTUAL_ENV=/opt/vllm
@@ -268,6 +290,7 @@ COPY --from=vllmcache-builder /tmp/control /dev/null
 COPY --from=numa-builder /tmp/control /dev/null
 COPY --from=lapack-builder /tmp/control /dev/null
 COPY --from=openblas-builder /tmp/control /dev/null
+COPY --from=numba-builder /tmp/control /dev/null
 
 # install gcc-11, python, openblas, numactl, lapack
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -276,13 +299,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \
     rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
     microdnf install --nodocs -y \
-    tar findutils openssl \
+    libomp tar findutils openssl llvm15 llvm15-devel \
     pkgconfig xsimd g++ gcc-fortran libsndfile \
     libtiff libjpeg openjpeg2 zlib zeromq \
     freetype lcms2 libwebp tcl tk utf8proc \
-    harfbuzz fribidi libraqm libimagequant libxcb \
+    harfbuzz fribidi libraqm libimagequant libxcb util-linux \
     python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
-    && microdnf clean all \
+    && export PATH=$PATH:/usr/lib64/llvm15/bin && microdnf clean all \
     && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
     && python -m pip install -U pip uv --no-cache \
     && make -C /numactl install \
@@ -298,7 +321,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
     --mount=type=bind,from=vllmcache-builder,source=/hf_wheels/,target=/hf_wheels/,ro \
     --mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \
-    HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl
+    --mount=type=bind,from=numba-builder,source=/numbawheels/,target=/numbawheels/,ro \
+    export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && uv pip install sentencepiece==0.2.0 && \
+    HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /numbawheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl
+
 
 COPY ./ /workspace/vllm
 WORKDIR /workspace/vllm
@@ -314,4 +340,4 @@ WORKDIR /workspace/
 
 RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
 
-ENTRYPOINT ["vllm", "serve"]
+ENTRYPOINT ["vllm", "serve"]
\ No newline at end of file

From 27ed39a347c56ac23f25169b7c86d7f459b1ac7f Mon Sep 17 00:00:00 2001
From: liuzhenwei <zhenwei.liu@intel.com>
Date: Sat, 11 Oct 2025 13:15:23 +0800
Subject: [PATCH 03/30] [XPU] Upgrade NIXL to remove CUDA dependency (#26570)

Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
---
 .buildkite/scripts/hardware_ci/run-xpu-test.sh |  1 -
 docker/Dockerfile.xpu                          |  5 +++++
 requirements/xpu.txt                           |  1 -
 tools/install_nixl_from_source_ubuntu.py       |  1 +
 vllm/platforms/xpu.py                          | 15 ++++++++-------
 5 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index 2fd7265fa5366..250a64fdd071c 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -44,6 +44,5 @@ docker run \
     pytest -v -s v1/structured_output
     pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
     pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
-    pytest -v -s v1/test_metrics
     pytest -v -s v1/test_serial_utils.py
 '
diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu
index ffc3abd389653..49ea39cad5128 100644
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -69,4 +69,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
+
+# install nixl from source code
+RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages/.nixl.mesonpy.libs/plugins/"
+
 ENTRYPOINT ["vllm", "serve"]
diff --git a/requirements/xpu.txt b/requirements/xpu.txt
index 5d52400e50bc6..d14b631aa9364 100644
--- a/requirements/xpu.txt
+++ b/requirements/xpu.txt
@@ -10,7 +10,6 @@ wheel
 jinja2>=3.1.6
 datasets # for benchmark scripts
 numba == 0.61.2 # Required for N-gram speculative decoding
-nixl==0.3.0 # for PD disaggregation
 torch==2.8.0+xpu
 torchaudio
 torchvision
diff --git a/tools/install_nixl_from_source_ubuntu.py b/tools/install_nixl_from_source_ubuntu.py
index c903e3f1d3f18..c808b01d2e94b 100644
--- a/tools/install_nixl_from_source_ubuntu.py
+++ b/tools/install_nixl_from_source_ubuntu.py
@@ -135,6 +135,7 @@ def build_and_install_prerequisites(args):
         "--enable-devel-headers",
         "--with-verbs",
         "--enable-mt",
+        "--with-ze=no",
     ]
     run_command(configure_command, cwd=ucx_source_path)
     run_command(["make", "-j", str(os.cpu_count() or 1)], cwd=ucx_source_path)
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index e0c8a6605b7d4..b75b52938839b 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -54,6 +54,14 @@ class XPUPlatform(Platform):
         has_sink: bool,
         use_sparse,
     ) -> str:
+        from vllm.v1.attention.backends.utils import set_kv_cache_layout
+
+        set_kv_cache_layout("NHD")
+        logger.info(
+            "Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; "
+            "only NHD layout is supported by XPU attention kernels."
+        )
+
         from vllm.attention.backends.registry import _Backend
 
         if use_sparse:
@@ -190,13 +198,6 @@ class XPUPlatform(Platform):
                 vllm_config.scheduler_config.max_model_len,
                 DEFAULT_MAX_NUM_BATCHED_TOKENS,
             )
-        from vllm.v1.attention.backends.utils import set_kv_cache_layout
-
-        set_kv_cache_layout("NHD")
-        logger.info(
-            "Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; "
-            "only NHD layout is supported by XPU attention kernels."
-        )
 
     @classmethod
     def support_hybrid_kv_cache(cls) -> bool:

From ddaff2938e0b78b2d6237f6f7975ac19167cb04e Mon Sep 17 00:00:00 2001
From: Roger Wang <hey@rogerw.io>
Date: Fri, 10 Oct 2025 22:17:24 -0700
Subject: [PATCH 04/30] [MM] Move Qwen3Omni MRoPE impl to model file (#26608)

Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 .../layers/rotary_embedding/mrope.py          | 361 +-----------------
 .../models/qwen3_omni_moe_thinker.py          | 355 +++++++++++++++--
 vllm/model_executor/models/vision.py          |  37 ++
 vllm/v1/worker/gpu_model_runner.py            |   2 +-
 4 files changed, 368 insertions(+), 387 deletions(-)

diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py
index 0a13543c82e19..ebfe9257c6c45 100644
--- a/vllm/model_executor/layers/rotary_embedding/mrope.py
+++ b/vllm/model_executor/layers/rotary_embedding/mrope.py
@@ -426,7 +426,7 @@ class MRotaryEmbedding(RotaryEmbedding):
     ) -> tuple[torch.Tensor, int]:
         from vllm.transformers_utils.config import thinker_uses_mrope
 
-        if thinker_uses_mrope(hf_config):
+        if thinker_uses_mrope(hf_config) and hf_config.model_type == "qwen2_5_omni":
             return cls._omni_get_input_positions_tensor(
                 input_tokens=input_tokens,
                 hf_config=hf_config,
@@ -1119,339 +1119,6 @@ class MRotaryEmbedding(RotaryEmbedding):
 
         return llm_positions, mrope_position_delta
 
-    @classmethod
-    def _omni3_get_input_positions_tensor(
-        cls,
-        config,
-        input_ids: torch.Tensor,
-        image_grid_thw: torch.Tensor,
-        video_grid_thw: torch.Tensor,
-        use_audio_in_video: bool = False,
-        audio_seqlens: Optional[torch.Tensor] = None,
-        second_per_grids: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        def _get_feat_extract_output_lengths(input_lengths: torch.LongTensor):
-            input_lengths_leave = input_lengths % 100
-            feat_lengths = (input_lengths_leave - 1) // 2 + 1
-            output_lengths = (
-                ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13
-            )
-            return output_lengths
-
-        if input_ids is None or input_ids.ndim != 1:
-            raise ValueError("_omni3_get_input_positions_tensor expects 1D input_ids")
-
-        seq_len = input_ids.shape[0]
-        device = input_ids.device
-        dtype = input_ids.dtype
-
-        if image_grid_thw is not None:
-            image_grid_thw = image_grid_thw.to(device=device, dtype=torch.long)
-        if video_grid_thw is not None:
-            video_grid_thw = video_grid_thw.to(device=device, dtype=torch.long)
-
-        if second_per_grids is None:
-            if video_grid_thw is not None and video_grid_thw.numel() > 0:
-                second_per_grids = torch.ones(
-                    video_grid_thw.shape[0], dtype=torch.float32, device=device
-                )
-            else:
-                second_per_grids = torch.tensor([], dtype=torch.float32, device=device)
-        else:
-            second_per_grids = second_per_grids.to(device=device, dtype=torch.float32)
-
-        if audio_seqlens is not None:
-            audio_seqlens = audio_seqlens.to(device=device, dtype=torch.long)
-
-        spatial_merge_size = config.vision_config.spatial_merge_size
-        image_token_id = config.image_token_id
-        video_token_id = config.video_token_id
-        audio_token_id = config.audio_token_id
-        vision_start_token_id = config.vision_start_token_id
-        audio_start_token_id = config.audio_start_token_id
-        position_id_per_seconds = config.position_id_per_seconds
-
-        vision_start_indices = torch.argwhere(
-            input_ids == vision_start_token_id
-        ).squeeze(1)
-        if vision_start_indices.numel() > 0:
-            vision_tokens = input_ids[vision_start_indices + 1]
-        else:
-            vision_tokens = input_ids.new_empty((0,), dtype=input_ids.dtype)
-        audio_nums = torch.sum(input_ids == audio_start_token_id)
-        image_nums = (vision_tokens == image_token_id).sum()
-        video_nums = (
-            (vision_tokens == audio_start_token_id).sum()
-            if use_audio_in_video
-            else (vision_tokens == video_token_id).sum()
-        )
-
-        input_tokens = input_ids.tolist()
-        llm_pos_ids_list: list[torch.Tensor] = []
-        st = 0
-        image_idx = 0
-        video_idx = 0
-        audio_idx = 0
-        remain_images, remain_videos, remain_audios = image_nums, video_nums, audio_nums  # noqa: E501
-        multimodal_nums = (
-            image_nums + audio_nums
-            if use_audio_in_video
-            else image_nums + video_nums + audio_nums
-        )  # noqa: E501
-
-        for _ in range(multimodal_nums):
-            st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
-            if (image_token_id in input_tokens or video_token_id in input_tokens) and (
-                remain_videos > 0 or remain_images > 0
-            ):
-                ed_vision_start = input_tokens.index(vision_start_token_id, st)
-            else:
-                ed_vision_start = len(input_tokens) + 1
-            if audio_token_id in input_tokens and remain_audios > 0:
-                ed_audio_start = input_tokens.index(audio_start_token_id, st)
-            else:
-                ed_audio_start = len(input_tokens) + 1
-            min_ed = min(ed_vision_start, ed_audio_start)
-
-            if min_ed == ed_audio_start:
-                text_len = min_ed - st
-                if text_len != 0:
-                    st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
-                    llm_pos_ids_list.append(
-                        torch.arange(text_len, device=device, dtype=torch.long)
-                        .view(1, -1)
-                        .expand(3, -1)
-                        + st_idx
-                    )
-                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
-                bos_len = 1
-                llm_pos_ids_list.append(
-                    torch.arange(bos_len, device=device, dtype=torch.long)
-                    .view(1, -1)
-                    .expand(3, -1)
-                    + st_idx
-                )
-                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
-                audio_len = _get_feat_extract_output_lengths(audio_seqlens[audio_idx])
-                llm_pos_ids = (
-                    torch.arange(audio_len, device=device, dtype=torch.long)
-                    .view(1, -1)
-                    .expand(3, -1)
-                    + st_idx
-                )
-                llm_pos_ids_list.append(llm_pos_ids)
-                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
-                eos_len = 1
-                llm_pos_ids_list.append(
-                    torch.arange(eos_len, device=device, dtype=torch.long)
-                    .view(1, -1)
-                    .expand(3, -1)
-                    + st_idx
-                )
-                st += text_len + bos_len + audio_len + eos_len
-                audio_idx += 1
-                remain_audios -= 1
-            elif (
-                min_ed == ed_vision_start
-                and input_ids[ed_vision_start + 1] == image_token_id
-            ):
-                text_len = min_ed - st
-                if text_len != 0:
-                    st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
-                    llm_pos_ids_list.append(
-                        torch.arange(text_len, device=device, dtype=torch.long)
-                        .view(1, -1)
-                        .expand(3, -1)
-                        + st_idx
-                    )
-                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
-                bos_len = 1
-                llm_pos_ids_list.append(
-                    torch.arange(bos_len, device=device, dtype=torch.long)
-                    .view(1, -1)
-                    .expand(3, -1)
-                    + st_idx
-                )
-                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
-                grid_t = image_grid_thw[image_idx][0]
-                grid_hs = image_grid_thw[:, 1]
-                grid_ws = image_grid_thw[:, 2]
-                t_index = torch.arange(grid_t, device=device) * position_id_per_seconds
-                llm_pos_ids = cls._get_llm_pos_ids_for_vision(
-                    st_idx, image_idx, spatial_merge_size, t_index, grid_hs, grid_ws
-                )
-                image_len = image_grid_thw[image_idx].prod() // (spatial_merge_size**2)
-                llm_pos_ids_list.append(llm_pos_ids)
-                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
-                eos_len = 1
-                llm_pos_ids_list.append(
-                    torch.arange(eos_len, device=device, dtype=torch.long)
-                    .view(1, -1)
-                    .expand(3, -1)
-                    + st_idx
-                )
-                st += text_len + bos_len + image_len + eos_len
-                image_idx += 1
-                remain_images -= 1
-            elif (
-                min_ed == ed_vision_start
-                and input_ids[ed_vision_start + 1] == video_token_id
-                and not use_audio_in_video
-            ):
-                text_len = min_ed - st
-                if text_len != 0:
-                    st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
-                    llm_pos_ids_list.append(
-                        torch.arange(text_len, device=device, dtype=torch.long)
-                        .view(1, -1)
-                        .expand(3, -1)
-                        + st_idx
-                    )
-                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
-                bos_len = 1
-                llm_pos_ids_list.append(
-                    torch.arange(bos_len, device=device, dtype=torch.long)
-                    .view(1, -1)
-                    .expand(3, -1)
-                    + st_idx
-                )
-                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
-                grid_t = video_grid_thw[video_idx][0]
-                grid_hs = video_grid_thw[:, 1]
-                grid_ws = video_grid_thw[:, 2]
-                t_index = (
-                    torch.arange(grid_t, device=device)
-                    * float(second_per_grids[video_idx].item())
-                    * position_id_per_seconds
-                )
-                llm_pos_ids = cls._get_llm_pos_ids_for_vision(
-                    st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws
-                )
-                video_len = video_grid_thw[video_idx].prod() // (spatial_merge_size**2)
-                llm_pos_ids_list.append(llm_pos_ids)
-                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
-                eos_len = 1
-                llm_pos_ids_list.append(
-                    torch.arange(eos_len, device=device, dtype=torch.long)
-                    .view(1, -1)
-                    .expand(3, -1)
-                    + st_idx
-                )
-                st += text_len + bos_len + video_len + eos_len
-                video_idx += 1
-                remain_videos -= 1
-            elif (
-                min_ed == ed_vision_start
-                and ed_vision_start + 1 == ed_audio_start
-                and use_audio_in_video
-            ):
-                text_len = min_ed - st
-                if text_len != 0:
-                    st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
-                    llm_pos_ids_list.append(
-                        torch.arange(text_len, device=device, dtype=torch.long)
-                        .view(1, -1)
-                        .expand(3, -1)
-                        + st_idx
-                    )
-                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
-                bos_len = 1
-                bos_block = (
-                    torch.arange(bos_len, device=device, dtype=torch.long)
-                    .view(1, -1)
-                    .expand(3, -1)
-                    + st_idx
-                )
-                llm_pos_ids_list.append(bos_block)
-                llm_pos_ids_list.append(bos_block)
-                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
-                audio_len = _get_feat_extract_output_lengths(audio_seqlens[audio_idx])
-                audio_llm_pos_ids = (
-                    torch.arange(audio_len, device=device, dtype=torch.long)
-                    .view(1, -1)
-                    .expand(3, -1)
-                    + st_idx
-                )
-                grid_t = video_grid_thw[video_idx][0]
-                grid_hs = video_grid_thw[:, 1]
-                grid_ws = video_grid_thw[:, 2]
-                t_index = (
-                    torch.arange(grid_t, device=device)
-                    * float(second_per_grids[video_idx].item())
-                    * position_id_per_seconds
-                )
-                video_llm_pos_ids = cls._get_llm_pos_ids_for_vision(
-                    st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws
-                )
-                video_data_index, audio_data_index = 0, 0
-                while (
-                    video_data_index < video_llm_pos_ids.shape[-1]
-                    and audio_data_index < audio_llm_pos_ids.shape[-1]
-                ):
-                    if (
-                        video_llm_pos_ids[0][video_data_index]
-                        <= audio_llm_pos_ids[0][audio_data_index]
-                    ):
-                        llm_pos_ids_list.append(
-                            video_llm_pos_ids[
-                                :, video_data_index : video_data_index + 1
-                            ]
-                        )
-                        video_data_index += 1
-                    else:
-                        llm_pos_ids_list.append(
-                            audio_llm_pos_ids[
-                                :, audio_data_index : audio_data_index + 1
-                            ]
-                        )
-                        audio_data_index += 1
-                if video_data_index < video_llm_pos_ids.shape[-1]:
-                    llm_pos_ids_list.append(
-                        video_llm_pos_ids[
-                            :, video_data_index : video_llm_pos_ids.shape[-1]
-                        ]
-                    )
-                if audio_data_index < audio_llm_pos_ids.shape[-1]:
-                    llm_pos_ids_list.append(
-                        audio_llm_pos_ids[
-                            :, audio_data_index : audio_llm_pos_ids.shape[-1]
-                        ]
-                    )
-                video_len = video_grid_thw[video_idx].prod() // (spatial_merge_size**2)
-                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
-                eos_len = 1
-                eos_block = (
-                    torch.arange(eos_len, device=device, dtype=torch.long)
-                    .view(1, -1)
-                    .expand(3, -1)
-                    + st_idx
-                )
-                llm_pos_ids_list.append(eos_block)
-                llm_pos_ids_list.append(eos_block)
-                st += text_len + bos_len * 2 + audio_len + video_len + eos_len * 2  # noqa: E501
-                audio_idx += 1
-                video_idx += 1
-                remain_videos -= 1
-                remain_audios -= 1
-
-        if st < len(input_tokens):
-            st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
-            text_len = len(input_tokens) - st
-            llm_pos_ids_list.append(
-                torch.arange(text_len, device=device, dtype=torch.long)
-                .view(1, -1)
-                .expand(3, -1)
-                + st_idx
-            )
-
-        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
-        if llm_positions.shape[1] != seq_len:
-            raise RuntimeError("Position ids length mismatch with input ids length")
-
-        position_ids = llm_positions.to(device=device, dtype=dtype)
-        mrope_position_delta = llm_positions.max() + 1 - seq_len
-        return position_ids, mrope_position_delta
-
     @classmethod
     def _omni_get_input_positions_tensor(
         cls,
@@ -1483,8 +1150,6 @@ class MRotaryEmbedding(RotaryEmbedding):
 
         # TODO(fyabc): refactor and share more code with
         #  _vl_get_input_positions_tensor.
-
-        model_type = hf_config.model_type
         thinker_config = hf_config.thinker_config
 
         if isinstance(image_grid_thw, list):
@@ -1492,30 +1157,6 @@ class MRotaryEmbedding(RotaryEmbedding):
         if isinstance(video_grid_thw, list):
             video_grid_thw = torch.tensor(video_grid_thw)
 
-        if "qwen3_omni" in model_type:
-            input_tensor = torch.tensor(input_tokens)
-            audio_lengths_tensor = audio_feature_lengths
-            if audio_lengths_tensor is not None and not isinstance(
-                audio_lengths_tensor, torch.Tensor
-            ):
-                audio_lengths_tensor = torch.as_tensor(
-                    audio_lengths_tensor, dtype=torch.long
-                )
-            second_per_grids_tensor = (
-                torch.tensor(second_per_grid_ts) if second_per_grid_ts else None
-            )
-
-            llm_positions, mrope_position_delta = cls._omni3_get_input_positions_tensor(  # noqa: E501
-                thinker_config,
-                input_tensor,
-                image_grid_thw,
-                video_grid_thw,
-                use_audio_in_video,
-                audio_lengths_tensor,
-                second_per_grids_tensor,
-            )
-            return llm_positions, mrope_position_delta
-
         audio_token_id = thinker_config.audio_token_index
         image_token_id = thinker_config.image_token_index
         video_token_id = thinker_config.video_token_index
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index 8a5aa9c2be3bf..6eb9faabd1c7f 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -72,7 +72,12 @@ from vllm.multimodal.processing import (
 )
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
+from .interfaces import (
+    MultiModalEmbeddings,
+    SupportsMRoPE,
+    SupportsMultiModal,
+    SupportsPP,
+)
 
 # yapf conflicts with isort for this block
 # yapf: disable
@@ -96,7 +101,7 @@ from .utils import (
     _merge_multimodal_embeddings,
     maybe_prefix,
 )
-from .vision import get_vit_attn_backend
+from .vision import get_llm_pos_ids_for_vision, get_vit_attn_backend
 
 try:
     import flash_attn
@@ -106,6 +111,15 @@ except (ImportError, ModuleNotFoundError):
 logger = init_logger(__name__)
 
 
+def _get_feat_extract_output_lengths(input_lengths: torch.Tensor):
+    """Return ``(feat_lengths, output_lengths)`` derived from ``input_lengths``."""
+    full_chunks, remainder = input_lengths // 100, input_lengths % 100
+    feat_lengths = (remainder - 1) // 2 + 1
+    # Halve (rounding up) twice more, then add 13 outputs per full block of 100.
+    output_lengths = ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + full_chunks * 13
+    return feat_lengths, output_lengths
+
+
 class Qwen3_VisionPatchEmbed(nn.Module):
     def __init__(
         self,
@@ -679,16 +693,6 @@ Qwen3OmniMoeThinkerDummyInputsBuilder = Qwen2_5OmniThinkerDummyInputsBuilder
 class Qwen3OmniMoeThinkerMultiModalProcessor(
     Qwen2_5OmniThinkerMultiModalProcessor,
 ):
-    def _get_feat_extract_output_lengths(
-        self, input_lengths: torch.Tensor
-    ) -> torch.Tensor:
-        input_lengths_leave = input_lengths % 100
-        feat_lengths = (input_lengths_leave - 1) // 2 + 1
-        output_lengths = (
-            ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13
-        )
-        return feat_lengths, output_lengths
-
     def _call_hf_processor(
         self,
         prompt: str,
@@ -882,13 +886,13 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
         if audio_feature_lengths is None and feature_attention_mask is None:
             audio_output_lengths = []
         elif audio_feature_lengths is not None:
-            _, audio_output_lens = self._get_feat_extract_output_lengths(
+            _, audio_output_lens = _get_feat_extract_output_lengths(
                 audio_feature_lengths
             )
             audio_output_lengths = audio_output_lens.tolist()
         elif feature_attention_mask is not None:
             assert isinstance(feature_attention_mask, torch.Tensor)
-            _, audio_output_lens = self._get_feat_extract_output_lengths(
+            _, audio_output_lens = _get_feat_extract_output_lengths(
                 feature_attention_mask.sum(-1)
             )
             audio_output_lengths = audio_output_lens.tolist()
@@ -1044,16 +1048,6 @@ class Qwen3OmniMoeConditionalGenerationMixin(Qwen2_5OmniConditionalGenerationMix
             else:
                 return torch.concat(mm_input, dim=dim)
 
-    def _get_feat_extract_output_lengths(
-        self, input_lengths: torch.Tensor
-    ) -> torch.Tensor:
-        input_lengths_leave = input_lengths % 100
-        feat_lengths = (input_lengths_leave - 1) // 2 + 1
-        output_lengths = (
-            ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13
-        )
-        return output_lengths, output_lengths
-
     def _process_audio_input(
         self,
         audio_input: Qwen2AudioFeatureInputs,
@@ -1072,8 +1066,8 @@ class Qwen3OmniMoeConditionalGenerationMixin(Qwen2_5OmniConditionalGenerationMix
         if audio_feature_lengths.ndim == 2:
             audio_feature_lengths = audio_feature_lengths.reshape(-1)
 
-        audio_feat_lengths, audio_output_lengths = (
-            self._get_feat_extract_output_lengths(audio_feature_lengths)
+        audio_feat_lengths, audio_output_lengths = _get_feat_extract_output_lengths(
+            audio_feature_lengths
         )
 
         audio_outputs = self.audio_tower(
@@ -1094,6 +1088,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
     nn.Module,
     SupportsMultiModal,
     SupportsPP,
+    SupportsMRoPE,
     Qwen3OmniMoeConditionalGenerationMixin,
 ):
     hf_to_vllm_mapper = WeightsMapper(
@@ -1407,3 +1402,311 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
         loaded_weights = loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
         return loaded_weights
+
+    @classmethod
+    def get_mrope_input_positions(
+        cls,
+        input_tokens: list[int],
+        hf_config: PretrainedConfig,
+        image_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
+        video_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
+        second_per_grid_ts: Optional[list[float]] = None,
+        context_len: int = 0,
+        seq_len: Optional[int] = None,
+        audio_feature_lengths: Optional[torch.Tensor] = None,
+        use_audio_in_video: bool = False,
+    ) -> tuple[torch.Tensor, int]:
+        """Compute Qwen3-Omni M-RoPE position ids for one prompt.
+
+        Returns a ``(3, len(input_tokens))`` tensor of position ids together
+        with the delta ``llm_positions.max() + 1 - len(input_tokens)`` as an
+        ``int``. ``context_len`` and the incoming ``seq_len`` are unused.
+        """
+        config = hf_config.thinker_config
+        if isinstance(image_grid_thw, list):
+            image_grid_thw = torch.tensor(image_grid_thw)
+        if isinstance(video_grid_thw, list):
+            video_grid_thw = torch.tensor(video_grid_thw)
+        input_ids = torch.tensor(input_tokens)
+        if input_ids.ndim != 1:
+            raise ValueError("get_mrope_input_positions expects 1D input_ids")
+
+        seq_len = input_ids.shape[0]
+        if audio_feature_lengths is not None and not isinstance(
+            audio_feature_lengths, torch.Tensor
+        ):
+            audio_feature_lengths = torch.as_tensor(
+                audio_feature_lengths, dtype=torch.long
+            )
+        if second_per_grid_ts is None:
+            if video_grid_thw is not None and video_grid_thw.numel() > 0:
+                second_per_grids = torch.ones(
+                    video_grid_thw.shape[0], dtype=torch.float32
+                )
+            else:
+                second_per_grids = torch.tensor([], dtype=torch.float32)
+        else:
+            second_per_grids = torch.tensor(second_per_grid_ts, dtype=torch.float32)
+
+        spatial_merge_size = config.vision_config.spatial_merge_size
+        image_token_id = config.image_token_id
+        video_token_id = config.video_token_id
+        audio_token_id = config.audio_token_id
+        vision_start_token_id = config.vision_start_token_id
+        audio_start_token_id = config.audio_start_token_id
+        position_id_per_seconds = config.position_id_per_seconds
+
+        vision_start_indices = torch.argwhere(
+            input_ids == vision_start_token_id
+        ).squeeze(1)
+        if vision_start_indices.numel() > 0:
+            vision_tokens = input_ids[vision_start_indices + 1]
+        else:
+            vision_tokens = input_ids.new_empty((0,), dtype=input_ids.dtype)
+        audio_nums = torch.sum(input_ids == audio_start_token_id)
+        image_nums = (vision_tokens == image_token_id).sum()
+        video_nums = (
+            (vision_tokens == audio_start_token_id).sum()
+            if use_audio_in_video
+            else (vision_tokens == video_token_id).sum()
+        )
+
+        llm_pos_ids_list: list[torch.Tensor] = []
+        st = 0
+        image_idx = 0
+        video_idx = 0
+        audio_idx = 0
+        remain_images, remain_videos, remain_audios = image_nums, video_nums, audio_nums  # noqa: E501
+        multimodal_nums = image_nums + audio_nums
+        if not use_audio_in_video:
+            multimodal_nums = image_nums + video_nums + audio_nums
+
+        for _ in range(multimodal_nums):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+            if (image_token_id in input_tokens or video_token_id in input_tokens) and (
+                remain_videos > 0 or remain_images > 0
+            ):
+                ed_vision_start = input_tokens.index(vision_start_token_id, st)
+            else:
+                ed_vision_start = len(input_tokens) + 1
+            if audio_token_id in input_tokens and remain_audios > 0:
+                ed_audio_start = input_tokens.index(audio_start_token_id, st)
+            else:
+                ed_audio_start = len(input_tokens) + 1
+            min_ed = min(ed_vision_start, ed_audio_start)
+
+            if min_ed == ed_audio_start:
+                text_len = min_ed - st
+                if text_len != 0:
+                    llm_pos_ids_list.append(
+                        torch.arange(text_len, dtype=torch.long)
+                        .view(1, -1)
+                        .expand(3, -1)
+                        + st_idx
+                    )
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                bos_len = 1
+                llm_pos_ids_list.append(
+                    torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                _, audio_len = _get_feat_extract_output_lengths(
+                    audio_feature_lengths[audio_idx]
+                )
+                llm_pos_ids = (
+                    torch.arange(audio_len, dtype=torch.long).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                llm_pos_ids_list.append(llm_pos_ids)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                eos_len = 1
+                llm_pos_ids_list.append(
+                    torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                st += text_len + bos_len + audio_len + eos_len
+                audio_idx += 1
+                remain_audios -= 1
+            elif (
+                min_ed == ed_vision_start
+                and input_ids[ed_vision_start + 1] == image_token_id
+            ):
+                text_len = min_ed - st
+                if text_len != 0:
+                    llm_pos_ids_list.append(
+                        torch.arange(text_len, dtype=torch.long)
+                        .view(1, -1)
+                        .expand(3, -1)
+                        + st_idx
+                    )
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                bos_len = 1
+                llm_pos_ids_list.append(
+                    torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                grid_t = image_grid_thw[image_idx][0]
+                grid_hs = image_grid_thw[:, 1]
+                grid_ws = image_grid_thw[:, 2]
+                t_index = torch.arange(grid_t) * position_id_per_seconds
+                llm_pos_ids = get_llm_pos_ids_for_vision(
+                    st_idx, image_idx, spatial_merge_size, t_index, grid_hs, grid_ws
+                )
+                image_len = image_grid_thw[image_idx].prod() // (spatial_merge_size**2)
+                llm_pos_ids_list.append(llm_pos_ids)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                eos_len = 1
+                llm_pos_ids_list.append(
+                    torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                st += text_len + bos_len + image_len + eos_len
+                image_idx += 1
+                remain_images -= 1
+            elif (
+                min_ed == ed_vision_start
+                and input_ids[ed_vision_start + 1] == video_token_id
+                and not use_audio_in_video
+            ):
+                text_len = min_ed - st
+                if text_len != 0:
+                    llm_pos_ids_list.append(
+                        torch.arange(text_len, dtype=torch.long)
+                        .view(1, -1)
+                        .expand(3, -1)
+                        + st_idx
+                    )
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                bos_len = 1
+                llm_pos_ids_list.append(
+                    torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                grid_t = video_grid_thw[video_idx][0]
+                grid_hs = video_grid_thw[:, 1]
+                grid_ws = video_grid_thw[:, 2]
+                t_index = (
+                    torch.arange(grid_t)
+                    * float(second_per_grids[video_idx].item())
+                    * position_id_per_seconds
+                )
+                llm_pos_ids = get_llm_pos_ids_for_vision(
+                    st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws
+                )
+                video_len = video_grid_thw[video_idx].prod() // (spatial_merge_size**2)
+                llm_pos_ids_list.append(llm_pos_ids)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                eos_len = 1
+                llm_pos_ids_list.append(
+                    torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                st += text_len + bos_len + video_len + eos_len
+                video_idx += 1
+                remain_videos -= 1
+            elif (
+                min_ed == ed_vision_start
+                and ed_vision_start + 1 == ed_audio_start
+                and use_audio_in_video
+            ):
+                text_len = min_ed - st
+                if text_len != 0:
+                    llm_pos_ids_list.append(
+                        torch.arange(text_len, dtype=torch.long)
+                        .view(1, -1)
+                        .expand(3, -1)
+                        + st_idx
+                    )
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                bos_len = 1
+                bos_block = (
+                    torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                llm_pos_ids_list.append(bos_block)
+                llm_pos_ids_list.append(bos_block)  # second start token reuses this id
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                _, audio_len = _get_feat_extract_output_lengths(
+                    audio_feature_lengths[audio_idx]
+                )
+                audio_llm_pos_ids = (
+                    torch.arange(audio_len, dtype=torch.long).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                grid_t = video_grid_thw[video_idx][0]
+                grid_hs = video_grid_thw[:, 1]
+                grid_ws = video_grid_thw[:, 2]
+                t_index = (
+                    torch.arange(grid_t)
+                    * float(second_per_grids[video_idx].item())
+                    * position_id_per_seconds
+                )
+                video_llm_pos_ids = get_llm_pos_ids_for_vision(
+                    st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws
+                )
+                video_data_index, audio_data_index = 0, 0
+                while (
+                    video_data_index < video_llm_pos_ids.shape[-1]
+                    and audio_data_index < audio_llm_pos_ids.shape[-1]
+                ):
+                    if (
+                        video_llm_pos_ids[0][video_data_index]
+                        <= audio_llm_pos_ids[0][audio_data_index]
+                    ):
+                        llm_pos_ids_list.append(
+                            video_llm_pos_ids[
+                                :, video_data_index : video_data_index + 1
+                            ]
+                        )
+                        video_data_index += 1
+                    else:
+                        llm_pos_ids_list.append(
+                            audio_llm_pos_ids[
+                                :, audio_data_index : audio_data_index + 1
+                            ]
+                        )
+                        audio_data_index += 1
+                if video_data_index < video_llm_pos_ids.shape[-1]:
+                    llm_pos_ids_list.append(
+                        video_llm_pos_ids[
+                            :, video_data_index : video_llm_pos_ids.shape[-1]
+                        ]
+                    )
+                if audio_data_index < audio_llm_pos_ids.shape[-1]:
+                    llm_pos_ids_list.append(
+                        audio_llm_pos_ids[
+                            :, audio_data_index : audio_llm_pos_ids.shape[-1]
+                        ]
+                    )
+                video_len = video_grid_thw[video_idx].prod() // (spatial_merge_size**2)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+                eos_len = 1
+                eos_block = (
+                    torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1)
+                    + st_idx
+                )
+                llm_pos_ids_list.append(eos_block)
+                llm_pos_ids_list.append(eos_block)  # second end token reuses this id
+                st += text_len + bos_len * 2 + audio_len + video_len + eos_len * 2  # noqa: E501
+                audio_idx += 1
+                video_idx += 1
+                remain_videos -= 1
+                remain_audios -= 1
+
+        if st < len(input_tokens):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
+            text_len = len(input_tokens) - st
+            llm_pos_ids_list.append(
+                torch.arange(text_len, dtype=torch.long).view(1, -1).expand(3, -1)
+                + st_idx
+            )
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        if llm_positions.shape[1] != seq_len:
+            raise RuntimeError("Position ids length mismatch with input ids length")
+
+        mrope_position_delta = (llm_positions.max() + 1 - seq_len).item()
+        return llm_positions, mrope_position_delta
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index 74262f8b94a68..e517109e94dd6 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -499,3 +499,40 @@ def run_dp_sharded_mrope_vision_model(
         "Found unassigned embeddings"
     )
     return out_embeddings
+
+
+def get_llm_pos_ids_for_vision(
+    start_idx: int,
+    vision_idx: int,
+    spatial_merge_size: int,
+    t_index: list[int],
+    grid_hs: torch.Tensor,
+    grid_ws: torch.Tensor,
+) -> torch.Tensor:
+    """Build the (temporal, height, width) position ids of one vision item.
+
+    Args:
+        start_idx: Offset added to every returned position id.
+        vision_idx: Entry of ``grid_hs`` / ``grid_ws`` describing the item.
+        spatial_merge_size: Divisor applied to the grid height and width.
+        t_index: One temporal position per frame.
+        grid_hs: Grid heights, indexed by ``vision_idx``.
+        grid_ws: Grid widths, indexed by ``vision_idx``.
+
+    Returns:
+        Tensor of shape ``(3, len(t_index) * h * w)`` stacking the temporal,
+        height and width ids, where ``h`` / ``w`` are the merged grid sizes.
+    """
+    llm_grid_h = grid_hs[vision_idx] // spatial_merge_size
+    llm_grid_w = grid_ws[vision_idx] // spatial_merge_size
+    num_frames = len(t_index)
+    # Height ids stay fixed across a row; width ids count up along it.
+    h_index = torch.arange(llm_grid_h).view(1, -1, 1)
+    h_index = h_index.expand(num_frames, -1, llm_grid_w).flatten()
+    w_index = torch.arange(llm_grid_w).view(1, 1, -1)
+    w_index = w_index.expand(num_frames, llm_grid_h, -1).flatten()
+    # Every frame repeats its temporal id over all h * w positions.
+    t_index_tensor = torch.Tensor(t_index).to(llm_grid_h.device).view(-1, 1)
+    t_index_tensor = t_index_tensor.expand(-1, llm_grid_h * llm_grid_w)
+    t_index_tensor = t_index_tensor.long().flatten()
+    return torch.stack([t_index_tensor, h_index, w_index]) + start_idx
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 2dce58237c7b0..a323835e575cc 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -875,7 +875,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             if mm_input.get("use_audio_in_video") is True:
                 use_audio_in_video = True
 
-        if supports_mrope(self.model):
+        if supports_mrope(self.get_model()):
             req_state.mrope_positions, req_state.mrope_position_delta = (
                 self.model.get_mrope_input_positions(
                     req_state.prompt_token_ids,

From 55392bc87932da63b888e58f371fe4b67b438496 Mon Sep 17 00:00:00 2001
From: "sangho.lee" <sanghol@allenai.org>
Date: Sat, 11 Oct 2025 00:28:23 -0500
Subject: [PATCH 05/30] [Bugfix][Multi Modal] Fix incorrect Molmo image
 processing (#26563)

Signed-off-by: sanghol <sanghol@allenai.org>
---
 vllm/model_executor/models/molmo.py | 40 +++++++++++++++++------------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 734841d0dc983..f1dd06f3a0650 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -114,11 +114,11 @@ class MolmoImageInputs(TensorSchema):
         TensorShape("bn", "nc", "np", dynamic_dims={"nc"}),
     ]
 
-    feat_is_patch: Annotated[
+    image_input_idx: Annotated[
         Union[torch.Tensor, list[torch.Tensor]],
         TensorShape("bn", "nc", "tp", dynamic_dims={"nc"}),
     ]
-    # A boolean mask indicating which image features correspond to patch tokens.
+    # An index tensor that maps image features to their corresponding patch tokens.
     num_crops: Annotated[torch.Tensor, TensorShape("bn")]
 
 
@@ -1177,7 +1177,7 @@ class MolmoProcessorWrapper:
             num_crops = torch.tensor(tilings).prod(-1) + 1
             assert num_crops.sum() == len(feat_is_patch)
 
-            outputs["feat_is_patch"] = feat_is_patch
+            outputs["image_input_idx"] = image_input_idx
             outputs["num_crops"] = num_crops
             outputs["img_patch_id"] = self.image_patch_id
 
@@ -1211,8 +1211,9 @@ class MolmoProcessingInfo(BaseProcessingInfo):
         image_token_length_w = processor.image_token_length_w
         image_token_length_h = processor.image_token_length_h
 
-        extra = image_token_length_w * image_token_length_h
-        joint = ((ncols + 1) // pooling_size) * ((nrows + 1) // pooling_size)
+        # Calculate total tokens: 2 for start/end + (w+1)*h for column separators
+        extra = 2 + (image_token_length_w + 1) * image_token_length_h
+        joint = 2 + ((ncols + 1) // pooling_size + 1) * ((nrows + 1) // pooling_size)
 
         return extra + joint
 
@@ -1299,7 +1300,7 @@ class MolmoMultiModalProcessor(BaseMultiModalProcessor[MolmoProcessingInfo]):
         return dict(
             images=MultiModalFieldConfig.flat_from_sizes("image", num_crops),
             image_masks=MultiModalFieldConfig.flat_from_sizes("image", num_crops),
-            feat_is_patch=MultiModalFieldConfig.flat_from_sizes("image", num_crops),
+            image_input_idx=MultiModalFieldConfig.flat_from_sizes("image", num_crops),
             num_crops=MultiModalFieldConfig.batched("image"),
             img_patch_id=MultiModalFieldConfig.shared("image", num_images),
         )
@@ -1444,7 +1445,7 @@ class MolmoForCausalLM(
     ) -> Optional[MolmoImageInputs]:
         images = kwargs.pop("images", None)
         image_masks = kwargs.pop("image_masks", None)
-        feat_is_patch = kwargs.pop("feat_is_patch", None)
+        image_input_idx = kwargs.pop("image_input_idx", None)
         num_crops = kwargs.pop("num_crops", None)
 
         if images is None:
@@ -1466,7 +1467,7 @@ class MolmoForCausalLM(
         return MolmoImageInputs(
             images=images,
             image_masks=image_masks,
-            feat_is_patch=feat_is_patch,
+            image_input_idx=image_input_idx,
             num_crops=num_crops,
         )
 
@@ -1476,7 +1477,7 @@ class MolmoForCausalLM(
     ) -> list[torch.Tensor]:
         images = image_input["images"]
         image_masks = image_input["image_masks"]
-        feat_is_patch = image_input["feat_is_patch"]
+        image_input_idx = image_input["image_input_idx"]
         num_crops = image_input["num_crops"]
 
         # Call the vision backbone on the whole batch at once
@@ -1484,7 +1485,7 @@ class MolmoForCausalLM(
         image_masks_flat = (
             None if image_masks is None else flatten_bn(image_masks, concat=True)
         )
-        feat_is_patch_flat = flatten_bn(feat_is_patch, concat=True)
+        image_input_idx_flat = flatten_bn(image_input_idx, concat=True)
 
         image_features_flat = self.vision_backbone(
             images=images_flat.unsqueeze(0),
@@ -1494,13 +1495,18 @@ class MolmoForCausalLM(
         ).squeeze(0)
 
         # Only the features corresponding to patch tokens are relevant
-        return [
-            feats[f_is_patch]
-            for feats, f_is_patch in zip(
-                image_features_flat.split(num_crops.tolist()),
-                feat_is_patch_flat.split(num_crops.tolist()),
-            )
-        ]
+        # Re-order the features using the image_input_idx tensor
+        results = []
+        num_crops_list = num_crops.tolist()
+        for feats, img_idx in zip(
+            image_features_flat.split(num_crops_list),
+            image_input_idx_flat.split(num_crops_list),
+        ):
+            is_valid = img_idx >= 0
+            valid_img_idx = img_idx[is_valid]
+            order = torch.argsort(valid_img_idx)
+            results.append(feats[is_valid][order])
+        return results
 
     def get_language_model(self) -> torch.nn.Module:
         return self.model

From 727144bed10ffd465e37d47a5a60747efc15368b Mon Sep 17 00:00:00 2001
From: dsinghvi <divyanshsinghvi@gmail.com>
Date: Sat, 11 Oct 2025 12:51:04 +0530
Subject: [PATCH 06/30] [Refactor]: Use M-RoPE interface directly while
 defining model class instead of maintaining model specific M-RoPE
 implementation in mrope.py (#24172)

Signed-off-by: Divyansh Singhvi <divyanshsinghvi@gmail.com>
Signed-off-by: dsinghvi <divyanshsinghvi@gmail.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: wwl2755 <wangwenlong2755@gmail.com>
---
 .../layers/rotary_embedding/mrope.py          | 1015 -----------------
 vllm/model_executor/models/ernie45_vl.py      |  151 ++-
 vllm/model_executor/models/glm4v.py           |  152 ++-
 vllm/model_executor/models/keye_vl1_5.py      |  144 ++-
 .../models/qwen2_5_omni_thinker.py            |  271 ++++-
 vllm/model_executor/models/qwen2_5_vl.py      |  130 ++-
 vllm/model_executor/models/qwen3_vl.py        |  115 +-
 vllm/model_executor/models/utils.py           |    8 +
 vllm/v1/worker/gpu_model_runner.py            |   39 +-
 9 files changed, 974 insertions(+), 1051 deletions(-)

diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py
index ebfe9257c6c45..fce110e6a5270 100644
--- a/vllm/model_executor/layers/rotary_embedding/mrope.py
+++ b/vllm/model_executor/layers/rotary_embedding/mrope.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import itertools
 from typing import Optional, Union
 
 import numpy as np
@@ -411,969 +410,6 @@ class MRotaryEmbedding(RotaryEmbedding):
 
         return llm_positions.tolist(), mrope_position_delta
 
-    @classmethod
-    def get_input_positions_tensor(
-        cls,
-        input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: Union[list[list[int]], torch.Tensor],
-        video_grid_thw: Union[list[list[int]], torch.Tensor],
-        second_per_grid_ts: list[float],
-        context_len: int = 0,
-        seq_len: Optional[int] = None,
-        audio_feature_lengths: Optional[torch.Tensor] = None,
-        use_audio_in_video: bool = False,
-    ) -> tuple[torch.Tensor, int]:
-        from vllm.transformers_utils.config import thinker_uses_mrope
-
-        if thinker_uses_mrope(hf_config) and hf_config.model_type == "qwen2_5_omni":
-            return cls._omni_get_input_positions_tensor(
-                input_tokens=input_tokens,
-                hf_config=hf_config,
-                image_grid_thw=image_grid_thw,
-                video_grid_thw=video_grid_thw,
-                second_per_grid_ts=second_per_grid_ts,
-                context_len=context_len,
-                seq_len=seq_len,
-                audio_feature_lengths=audio_feature_lengths,
-                use_audio_in_video=use_audio_in_video,
-            )
-        elif hf_config.model_type in ["glm4v", "glm4v_moe"]:
-            return cls._glm4v_get_input_positions_tensor(
-                input_tokens=input_tokens,
-                hf_config=hf_config,
-                image_grid_thw=image_grid_thw,
-                video_grid_thw=video_grid_thw,
-                context_len=context_len,
-                seq_len=seq_len,
-            )
-        elif hf_config.model_type in ["qwen3_vl", "qwen3_vl_moe"]:
-            return cls._qwen3vl_get_input_positions_tensor(
-                input_tokens=input_tokens,
-                hf_config=hf_config,
-                image_grid_thw=image_grid_thw,
-                video_grid_thw=video_grid_thw,
-                context_len=context_len,
-                seq_len=seq_len,
-            )
-        elif hf_config.model_type in ["ernie4_5_moe_vl", "ernie4_5_vl"]:
-            return cls._ernie_get_input_positions_tensor(
-                input_tokens=input_tokens,
-                hf_config=hf_config,
-                image_grid_thw=image_grid_thw,
-                video_grid_thw=video_grid_thw,
-                context_len=context_len,
-                seq_len=seq_len,
-            )
-        elif "KeyeVL1_5" in hf_config.model_type:
-            return cls._keye_get_input_positions_tensor(
-                input_tokens=input_tokens,
-                hf_config=hf_config,
-                image_grid_thw=image_grid_thw,
-                video_grid_thw=video_grid_thw,
-                context_len=context_len,
-                seq_len=seq_len,
-            )
-        else:
-            return cls._vl_get_input_positions_tensor(
-                input_tokens=input_tokens,
-                hf_config=hf_config,
-                image_grid_thw=image_grid_thw,
-                video_grid_thw=video_grid_thw,
-                second_per_grid_ts=second_per_grid_ts,
-                context_len=context_len,
-                seq_len=seq_len,
-            )
-
-    @classmethod
-    def _glm4v_get_input_positions_tensor(
-        cls,
-        input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: Union[list[list[int]], torch.Tensor],
-        video_grid_thw: Union[list[list[int]], torch.Tensor],
-        context_len: int = 0,
-        seq_len: Optional[int] = None,
-    ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value for GLM4V."""
-
-        image_token_id = hf_config.image_token_id
-        video_start_token_id = hf_config.video_start_token_id
-        video_end_token_id = hf_config.video_end_token_id
-        spatial_merge_size = hf_config.vision_config.spatial_merge_size
-        llm_pos_ids_list: list = []
-
-        if not (image_grid_thw is None and video_grid_thw is None):
-            if isinstance(image_grid_thw, torch.Tensor):
-                image_grid_thw = image_grid_thw.tolist()
-
-            input_token_type: list[str] = []
-            video_check_flg = False
-            for token in input_tokens:
-                if token == video_start_token_id:
-                    video_check_flg = True
-                elif token == video_end_token_id:
-                    video_check_flg = False
-
-                if (token == image_token_id) and (video_check_flg is False):
-                    input_token_type.append("image")
-                elif (token == image_token_id) and (video_check_flg is True):
-                    input_token_type.append("video")
-                else:
-                    input_token_type.append("text")
-
-            input_type_group: list[tuple[str, int, int]] = []
-            for key, group_iter in itertools.groupby(
-                enumerate(input_token_type), lambda x: x[1]
-            ):
-                group_list = list(group_iter)
-                start_index = group_list[0][0]
-                end_index = group_list[-1][0] + 1
-                input_type_group.append((key, start_index, end_index))
-
-            video_frame_num = 1
-            mm_data_idx = 0
-            for modality_type, start_idx, end_idx in input_type_group:
-                st_idx = (
-                    llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
-                )
-                if modality_type == "image":
-                    t, h, w = (
-                        image_grid_thw[mm_data_idx][0],
-                        image_grid_thw[mm_data_idx][1],
-                        image_grid_thw[mm_data_idx][2],
-                    )
-                    llm_grid_t, llm_grid_h, llm_grid_w = (
-                        t,
-                        h // spatial_merge_size,
-                        w // spatial_merge_size,
-                    )
-
-                    t_index = (
-                        torch.arange(llm_grid_t)
-                        .view(-1, 1)
-                        .expand(-1, llm_grid_h * llm_grid_w)
-                        .flatten()
-                    )
-                    h_index = (
-                        torch.arange(llm_grid_h)
-                        .view(1, -1, 1)
-                        .expand(llm_grid_t, -1, llm_grid_w)
-                        .flatten()
-                    )
-                    w_index = (
-                        torch.arange(llm_grid_w)
-                        .view(1, 1, -1)
-                        .expand(llm_grid_t, llm_grid_h, -1)
-                        .flatten()
-                    )
-                    llm_pos_ids_list.append(
-                        torch.stack([t_index, h_index, w_index]) + st_idx
-                    )
-                    mm_data_idx += 1
-
-                elif modality_type == "video":
-                    t, h, w = (
-                        video_frame_num,
-                        image_grid_thw[mm_data_idx][1],
-                        image_grid_thw[mm_data_idx][2],
-                    )
-                    llm_grid_t, llm_grid_h, llm_grid_w = (
-                        t,
-                        h // spatial_merge_size,
-                        w // spatial_merge_size,
-                    )
-
-                    for t_idx in range(llm_grid_t):
-                        t_index = (
-                            torch.tensor(t_idx)
-                            .view(-1, 1)
-                            .expand(-1, llm_grid_h * llm_grid_w)
-                            .flatten()
-                        )
-                        h_index = (
-                            torch.arange(llm_grid_h)
-                            .view(1, -1, 1)
-                            .expand(1, -1, llm_grid_w)
-                            .flatten()
-                        )
-                        w_index = (
-                            torch.arange(llm_grid_w)
-                            .view(1, 1, -1)
-                            .expand(1, llm_grid_h, -1)
-                            .flatten()
-                        )
-                        llm_pos_ids_list.append(
-                            torch.stack([t_index, h_index, w_index]) + st_idx
-                        )
-
-                    mm_data_idx += 1
-                    video_frame_num += 1
-
-                else:
-                    text_len = end_idx - start_idx
-                    llm_pos_ids_list.append(
-                        torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
-                    )
-                    video_frame_num = 1
-
-        else:
-            text_len = len(input_tokens)
-            llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1))
-
-        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
-        llm_positions = llm_positions[:, context_len:seq_len]
-        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
-        return llm_positions, mrope_position_delta
-
-    @classmethod
-    def _qwen3vl_get_input_positions_tensor(
-        cls,
-        input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: Union[list[list[int]], torch.Tensor],
-        video_grid_thw: Union[list[list[int]], torch.Tensor],
-        context_len: int = 0,
-        seq_len: Optional[int] = None,
-    ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value."""
-
-        video_grid_thw = [[1, h, w] for t, h, w in video_grid_thw for _ in range(t)]
-
-        image_token_id = hf_config.image_token_id
-        video_token_id = hf_config.video_token_id
-        vision_start_token_id = hf_config.vision_start_token_id
-        spatial_merge_size = hf_config.vision_config.spatial_merge_size
-
-        input_tokens_tensor = torch.tensor(input_tokens)
-        vision_start_indices = torch.argwhere(
-            input_tokens_tensor == vision_start_token_id
-        ).squeeze(1)
-        vision_tokens = input_tokens_tensor[vision_start_indices + 1]
-        image_nums = (vision_tokens == image_token_id).sum()
-        video_nums = (vision_tokens == video_token_id).sum()
-        llm_pos_ids_list: list = []
-
-        st = 0
-        remain_images, remain_videos = image_nums, video_nums
-
-        image_index, video_index = 0, 0
-        for _ in range(image_nums + video_nums):
-            if image_token_id in input_tokens and remain_images > 0:
-                ed_image = input_tokens.index(image_token_id, st)
-            else:
-                ed_image = len(input_tokens) + 1
-            if video_token_id in input_tokens and remain_videos > 0:
-                ed_video = input_tokens.index(video_token_id, st)
-            else:
-                ed_video = len(input_tokens) + 1
-            if ed_image < ed_video:
-                t, h, w = (
-                    image_grid_thw[image_index][0],
-                    image_grid_thw[image_index][1],
-                    image_grid_thw[image_index][2],
-                )
-                image_index += 1
-                remain_images -= 1
-                ed = ed_image
-            else:
-                t, h, w = (
-                    video_grid_thw[video_index][0],
-                    video_grid_thw[video_index][1],
-                    video_grid_thw[video_index][2],
-                )
-                video_index += 1
-                remain_videos -= 1
-                ed = ed_video
-
-            llm_grid_t, llm_grid_h, llm_grid_w = (
-                t,
-                h // spatial_merge_size,
-                w // spatial_merge_size,
-            )
-            text_len = ed - st
-
-            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
-            llm_pos_ids_list.append(
-                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
-            )
-
-            t_index = (
-                torch.arange(llm_grid_t)
-                .view(-1, 1)
-                .expand(-1, llm_grid_h * llm_grid_w)
-                .flatten()
-            )
-            h_index = (
-                torch.arange(llm_grid_h)
-                .view(1, -1, 1)
-                .expand(llm_grid_t, -1, llm_grid_w)
-                .flatten()
-            )
-            w_index = (
-                torch.arange(llm_grid_w)
-                .view(1, 1, -1)
-                .expand(llm_grid_t, llm_grid_h, -1)
-                .flatten()
-            )
-            llm_pos_ids_list.append(
-                torch.stack([t_index, h_index, w_index]) + text_len + st_idx
-            )
-            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
-
-        if st < len(input_tokens):
-            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
-            text_len = len(input_tokens) - st
-            llm_pos_ids_list.append(
-                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
-            )
-
-        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
-        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
-        llm_positions = llm_positions[:, context_len:seq_len]
-        return llm_positions, mrope_position_delta
-
-    @classmethod
-    def _ernie_get_input_positions_tensor(
-        cls,
-        input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: Union[list[list[int]], torch.Tensor],
-        video_grid_thw: Union[list[list[int]], torch.Tensor],
-        context_len: int = 0,
-        seq_len: Optional[int] = None,
-    ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value for Ernie VL."""
-
-        image_token_id = hf_config.im_patch_id
-        video_start_token_id = hf_config.video_start_token_id
-        video_end_token_id = hf_config.video_end_token_id
-        spatial_conv_size = hf_config.spatial_conv_size
-        temporal_conv_size = hf_config.temporal_conv_size
-        llm_pos_ids_list: list = []
-
-        if not (image_grid_thw is None and video_grid_thw is None):
-            if isinstance(image_grid_thw, torch.Tensor):
-                image_grid_thw = image_grid_thw.tolist()
-
-            input_token_type: list[str] = []
-            video_check_flg = False
-            for token in input_tokens:
-                if token == video_start_token_id:
-                    video_check_flg = True
-                elif token == video_end_token_id:
-                    video_check_flg = False
-
-                if (token == image_token_id) and (video_check_flg is False):
-                    input_token_type.append("image")
-                elif (token == image_token_id) and (video_check_flg is True):
-                    input_token_type.append("video")
-                else:
-                    input_token_type.append("text")
-
-            input_type_group: list[tuple[str, int, int]] = []
-            for key, group_iter in itertools.groupby(
-                enumerate(input_token_type), lambda x: x[1]
-            ):
-                group_list = list(group_iter)
-                start_index = group_list[0][0]
-                end_index = group_list[-1][0] + 1
-                input_type_group.append((key, start_index, end_index))
-
-            video_frame_num = 1
-            mm_data_idx = 0
-            for modality_type, start_idx, end_idx in input_type_group:
-                st_idx = (
-                    llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
-                )
-                if modality_type == "image":
-                    t, h, w = (
-                        image_grid_thw[mm_data_idx][0],
-                        image_grid_thw[mm_data_idx][1],
-                        image_grid_thw[mm_data_idx][2],
-                    )
-                    llm_grid_t, llm_grid_h, llm_grid_w = (
-                        t,
-                        h // spatial_conv_size,
-                        w // spatial_conv_size,
-                    )
-
-                    t_index = (
-                        torch.arange(llm_grid_t)
-                        .view(-1, 1)
-                        .expand(-1, llm_grid_h * llm_grid_w)
-                        .flatten()
-                    )
-                    h_index = (
-                        torch.arange(llm_grid_h)
-                        .view(1, -1, 1)
-                        .expand(llm_grid_t, -1, llm_grid_w)
-                        .flatten()
-                    )
-                    w_index = (
-                        torch.arange(llm_grid_w)
-                        .view(1, 1, -1)
-                        .expand(llm_grid_t, llm_grid_h, -1)
-                        .flatten()
-                    )
-                    llm_pos_ids_list.append(
-                        torch.stack([t_index, h_index, w_index]) + st_idx
-                    )
-                    mm_data_idx += 1
-
-                elif modality_type == "video":
-                    t, h, w = (
-                        video_grid_thw[mm_data_idx][0],
-                        video_grid_thw[mm_data_idx][1],
-                        video_grid_thw[mm_data_idx][2],
-                    )
-                    llm_grid_t, llm_grid_h, llm_grid_w = (
-                        t // temporal_conv_size,
-                        h // spatial_conv_size,
-                        w // spatial_conv_size,
-                    )
-
-                    for t_idx in range(llm_grid_t):
-                        t_index = (
-                            torch.tensor(t_idx)
-                            .view(-1, 1)
-                            .expand(-1, llm_grid_h * llm_grid_w)
-                            .flatten()
-                        )
-                        h_index = (
-                            torch.arange(llm_grid_h)
-                            .view(1, -1, 1)
-                            .expand(1, -1, llm_grid_w)
-                            .flatten()
-                        )
-                        w_index = (
-                            torch.arange(llm_grid_w)
-                            .view(1, 1, -1)
-                            .expand(1, llm_grid_h, -1)
-                            .flatten()
-                        )
-                        llm_pos_ids_list.append(
-                            torch.stack([t_index, h_index, w_index]) + st_idx
-                        )
-
-                    mm_data_idx += 1
-                    video_frame_num += 1
-
-                else:
-                    text_len = end_idx - start_idx
-                    llm_pos_ids_list.append(
-                        torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
-                    )
-                    video_frame_num = 1
-
-        else:
-            text_len = len(input_tokens)
-            llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1))
-
-        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
-        llm_positions = llm_positions[:, context_len:seq_len]
-        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
-        return llm_positions, mrope_position_delta
-
-    @classmethod
-    def _keye_get_input_positions_tensor(
-        cls,
-        input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: Union[list[list[int]], torch.Tensor],
-        video_grid_thw: Union[list[list[int]], torch.Tensor],
-        context_len: int = 0,
-        seq_len: Optional[int] = None,
-    ) -> tuple[torch.Tensor, int]:
-        if isinstance(video_grid_thw, list) and len(video_grid_thw) > 0:
-            video_grid_thw = video_grid_thw[0]
-        """Get mrope input positions and delta value (Keye series)."""
-
-        def split_thw(grid_thw: Union[torch.Tensor, list[int]]) -> list[list[int]]:
-            """
-            Split grid_thw along the t dimension.
-
-            Args:
-                grid_thw: shape [N, 3] tensor or nested list of [t, h, w].
-
-            Returns:
-                List of [1, h, w] rows, repeated t times for each original row.
-            """
-
-            if isinstance(grid_thw, list):
-                grid_thw = torch.tensor(grid_thw, dtype=torch.long)
-
-            if grid_thw.numel() == 0:
-                return []
-
-            t, hw = grid_thw[:, 0], grid_thw[:, 1:]
-            ones = torch.ones_like(hw[:, :1])  # [N,1]
-            out = torch.cat([ones, hw], dim=1).repeat_interleave(t, dim=0)
-            return out.tolist()
-
-        video_grid_thw = split_thw(video_grid_thw)
-
-        image_token_id = hf_config.image_token_id
-        video_token_id = hf_config.video_token_id
-        spatial_merge_size = hf_config.vision_config.spatial_merge_size
-
-        image_nums = len(image_grid_thw)
-        frame_nums = len(video_grid_thw)
-        llm_pos_ids_list: list = []
-
-        st = 0
-        remain_images, remain_frames = image_nums, frame_nums
-
-        image_index, video_index = 0, 0
-        for _ in range(image_nums + frame_nums):
-            if remain_images > 0:
-                try:
-                    ed_image = input_tokens.index(image_token_id, st)
-                except ValueError:
-                    ed_image = len(input_tokens) + 1
-            else:
-                ed_image = len(input_tokens) + 1
-            if remain_frames > 0:
-                try:
-                    ed_video = input_tokens.index(video_token_id, st)
-                except ValueError:
-                    ed_video = len(input_tokens) + 1
-            else:
-                ed_video = len(input_tokens) + 1
-
-            if ed_image < ed_video:
-                t, h, w = (
-                    image_grid_thw[image_index][0],
-                    image_grid_thw[image_index][1],
-                    image_grid_thw[image_index][2],
-                )
-                image_index += 1
-                remain_images -= 1
-                ed = ed_image
-            else:
-                t, h, w = (
-                    video_grid_thw[video_index][0],
-                    video_grid_thw[video_index][1],
-                    video_grid_thw[video_index][2],
-                )
-                video_index += 1
-                remain_frames -= 1
-                ed = ed_video
-
-            llm_grid_t, llm_grid_h, llm_grid_w = (
-                t,
-                h // spatial_merge_size,
-                w // spatial_merge_size,
-            )
-            text_len = ed - st
-
-            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
-            llm_pos_ids_list.append(
-                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
-            )
-            t_index = (
-                torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w)
-            ).flatten()
-            h_index = (
-                torch.arange(llm_grid_h)
-                .view(1, -1, 1)
-                .expand(llm_grid_t, -1, llm_grid_w)
-                .flatten()
-            )
-            w_index = (
-                torch.arange(llm_grid_w)
-                .view(1, 1, -1)
-                .expand(llm_grid_t, llm_grid_h, -1)
-                .flatten()
-            )
-            llm_pos_ids_list.append(
-                torch.stack([t_index, h_index, w_index]) + text_len + st_idx
-            )
-            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
-
-        if st < len(input_tokens):
-            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
-            text_len = len(input_tokens) - st
-            llm_pos_ids_list.append(
-                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
-            )
-
-        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
-        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
-        llm_positions = llm_positions[:, context_len:seq_len]
-
-        return llm_positions, mrope_position_delta
-
-    @classmethod
-    def _vl_get_input_positions_tensor(
-        cls,
-        input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: Union[list[list[int]], torch.Tensor],
-        video_grid_thw: Union[list[list[int]], torch.Tensor],
-        second_per_grid_ts: list[float],
-        context_len: int = 0,
-        seq_len: Optional[int] = None,
-    ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value."""
-
-        image_token_id = hf_config.image_token_id
-        video_token_id = hf_config.video_token_id
-        vision_start_token_id = hf_config.vision_start_token_id
-        spatial_merge_size = hf_config.vision_config.spatial_merge_size
-        tokens_per_second = getattr(hf_config.vision_config, "tokens_per_second", 1.0)
-
-        input_tokens_tensor = torch.tensor(input_tokens)
-        vision_start_indices = torch.argwhere(
-            input_tokens_tensor == vision_start_token_id
-        ).squeeze(1)
-        vision_tokens = input_tokens_tensor[vision_start_indices + 1]
-        image_nums = (vision_tokens == image_token_id).sum()
-        video_nums = (vision_tokens == video_token_id).sum()
-        llm_pos_ids_list: list = []
-
-        st = 0
-        remain_images, remain_videos = image_nums, video_nums
-        image_index, video_index = 0, 0
-        for _ in range(image_nums + video_nums):
-            video_second_per_grid_t = 0.0
-            if remain_images > 0:
-                try:
-                    ed_image = input_tokens.index(image_token_id, st)
-                except ValueError:
-                    ed_image = len(input_tokens) + 1
-            else:
-                ed_image = len(input_tokens) + 1
-            if remain_videos > 0:
-                try:
-                    ed_video = input_tokens.index(video_token_id, st)
-                except ValueError:
-                    ed_video = len(input_tokens) + 1
-            else:
-                ed_video = len(input_tokens) + 1
-            if ed_image < ed_video:
-                t, h, w = (
-                    image_grid_thw[image_index][0],
-                    image_grid_thw[image_index][1],
-                    image_grid_thw[image_index][2],
-                )
-                image_index += 1
-                remain_images -= 1
-                ed = ed_image
-            else:
-                t, h, w = (
-                    video_grid_thw[video_index][0],
-                    video_grid_thw[video_index][1],
-                    video_grid_thw[video_index][2],
-                )
-                video_second_per_grid_t = 1.0
-                if second_per_grid_ts:
-                    video_second_per_grid_t = second_per_grid_ts[video_index]
-                video_index += 1
-                remain_videos -= 1
-                ed = ed_video
-
-            llm_grid_t, llm_grid_h, llm_grid_w = (
-                t,
-                h // spatial_merge_size,
-                w // spatial_merge_size,
-            )
-            text_len = ed - st
-
-            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
-            llm_pos_ids_list.append(
-                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
-            )
-            t_index = (
-                torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w)
-                * video_second_per_grid_t
-                * tokens_per_second
-            ).flatten()
-            h_index = (
-                torch.arange(llm_grid_h)
-                .view(1, -1, 1)
-                .expand(llm_grid_t, -1, llm_grid_w)
-                .flatten()
-            )
-            w_index = (
-                torch.arange(llm_grid_w)
-                .view(1, 1, -1)
-                .expand(llm_grid_t, llm_grid_h, -1)
-                .flatten()
-            )
-            llm_pos_ids_list.append(
-                torch.stack([t_index, h_index, w_index]) + text_len + st_idx
-            )
-            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
-
-        if st < len(input_tokens):
-            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
-            text_len = len(input_tokens) - st
-            llm_pos_ids_list.append(
-                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
-            )
-
-        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
-        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
-        llm_positions = llm_positions[:, context_len:seq_len]
-
-        return llm_positions, mrope_position_delta
-
-    @classmethod
-    def _omni_get_input_positions_tensor(
-        cls,
-        input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: Union[list[list[int]], torch.Tensor],
-        video_grid_thw: Union[list[list[int]], torch.Tensor],
-        second_per_grid_ts: Optional[list[float]] = None,
-        context_len: int = 0,
-        seq_len: Optional[int] = None,
-        audio_feature_lengths: Optional[torch.Tensor] = None,
-        use_audio_in_video: bool = False,
-    ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value (Qwen2.5-Omni version).
-
-        Differences from MRotaryEmbedding:
-            1. Add audio support (and related `audio_feature_lengths`).
-            2. Add `use_audio_in_video` option to read audio from video inputs.
-                In this case, audio and vision position ids will be split into
-                chunks and interleaved.
-
-        Example:
-
-            (V_i are vision position ids, A_i are audio position ids)
-
-            |V_1 ...    V_n|A_1 ...   A_n|V_n+1 ... V_2n|A_n+1 ... A_2n|...
-            |vision chunk 1|audio chunk 1|vision chunk 2|audio chunk 2 |...
-        """
-
-        # TODO(fyabc): refactor and share more code with
-        #  _vl_get_input_positions_tensor.
-        thinker_config = hf_config.thinker_config
-
-        if isinstance(image_grid_thw, list):
-            image_grid_thw = torch.tensor(image_grid_thw)
-        if isinstance(video_grid_thw, list):
-            video_grid_thw = torch.tensor(video_grid_thw)
-
-        audio_token_id = thinker_config.audio_token_index
-        image_token_id = thinker_config.image_token_index
-        video_token_id = thinker_config.video_token_index
-        audio_start_token_id = thinker_config.audio_start_token_id
-        audio_end_token_id = thinker_config.audio_end_token_id
-        vision_start_token_id = thinker_config.vision_start_token_id
-        vision_end_token_id = thinker_config.vision_end_token_id
-        seconds_per_chunk = thinker_config.seconds_per_chunk
-        spatial_merge_size = thinker_config.vision_config.spatial_merge_size
-        tokens_per_second = getattr(
-            thinker_config.vision_config, "tokens_per_second", 25
-        )
-
-        src_item = input_tokens
-        audio_seqlens = audio_feature_lengths
-        if not second_per_grid_ts:
-            second_per_grid_ts = [1] * video_grid_thw.shape[0]
-        audio_idx = 0
-        video_idx = 0
-        image_idx = 0
-        new_src_item: list[int] = []
-        llm_pos_ids_list: list[torch.Tensor] = []
-
-        idx = 0
-        while idx < len(src_item):
-            new_src_item_len = len(new_src_item)
-            start_idx = (
-                llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
-            )
-            if src_item[idx] not in [audio_token_id, video_token_id, image_token_id]:
-                if use_audio_in_video and idx > 0:
-                    if (
-                        src_item[idx] == vision_end_token_id
-                        and src_item[idx - 1] == audio_end_token_id
-                    ):
-                        # processing the <|audio_eos|> before <|vision_eos|>
-                        start_idx -= 1
-                    elif (
-                        src_item[idx] == audio_start_token_id
-                        and src_item[idx - 1] == vision_start_token_id
-                    ):
-                        # processing the <|audio_bos|> after <|vision_eos|>
-                        start_idx -= 1
-                new_src_item.append(src_item[idx])
-                llm_pos_ids = torch.tensor([start_idx], dtype=torch.long).expand(3, -1)
-                llm_pos_ids_list.append(llm_pos_ids)
-            elif src_item[idx] == audio_token_id:
-                assert audio_seqlens is not None
-                audio_seqlen = audio_seqlens[audio_idx]
-                place_num = ((audio_seqlen - 1) // 2 + 1 - 2) // 2 + 1
-                new_src_item.extend([audio_token_id] * place_num)
-                llm_pos_ids = torch.arange(place_num).expand(3, -1) + start_idx
-                llm_pos_ids_list.append(llm_pos_ids)
-                audio_idx += 1
-            elif src_item[idx] == image_token_id:
-                grid_t = image_grid_thw[image_idx][0]
-                grid_hs = image_grid_thw[:, 1]
-                grid_ws = image_grid_thw[:, 2]
-                t_index = torch.arange(grid_t) * 1 * tokens_per_second
-                llm_pos_ids = cls._get_llm_pos_ids_for_vision(
-                    start_idx, image_idx, spatial_merge_size, t_index, grid_hs, grid_ws
-                )
-                llm_pos_ids_list.append(llm_pos_ids)
-                vision_seqlen = image_grid_thw[image_idx].prod() // (
-                    spatial_merge_size**2
-                )
-                new_src_item.extend([image_token_id] * vision_seqlen)
-                image_idx += 1
-            elif src_item[idx] == video_token_id and not use_audio_in_video:
-                grid_t = video_grid_thw[video_idx][0]
-                grid_hs = video_grid_thw[:, 1]
-                grid_ws = video_grid_thw[:, 2]
-                t_index = (
-                    torch.arange(grid_t)
-                    * second_per_grid_ts[video_idx]
-                    * tokens_per_second
-                )
-                llm_pos_ids = cls._get_llm_pos_ids_for_vision(
-                    start_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws
-                )
-                llm_pos_ids_list.append(llm_pos_ids)
-                vision_seqlen = video_grid_thw[video_idx].prod() // (
-                    spatial_merge_size**2
-                )
-                new_src_item.extend([video_token_id] * vision_seqlen)
-                video_idx += 1
-            else:
-                # read audio from video
-                assert audio_seqlens is not None
-                audio_seqlen = audio_seqlens[audio_idx]
-                vision_seqlen = video_grid_thw[video_idx].prod() // (
-                    spatial_merge_size**2
-                )
-                grid_t = video_grid_thw[video_idx][0]
-                grid_h = video_grid_thw[video_idx][1]
-                grid_w = video_grid_thw[video_idx][2]
-                grid_hs = video_grid_thw[:, 1]
-                grid_ws = video_grid_thw[:, 2]
-                t_ntoken_per_chunk = int(tokens_per_second * seconds_per_chunk)
-                t_index = (
-                    torch.arange(grid_t)
-                    * second_per_grid_ts[video_idx]
-                    * tokens_per_second
-                )
-                t_index_split_chunk = cls._split_list_into_ranges(
-                    t_index, t_ntoken_per_chunk
-                )
-                place_num = (((audio_seqlen - 1) // 2 + 1 - 2) // 2 + 1) + 2
-                pure_audio_len = place_num - 2
-                added_audio_len = 0
-                audio_llm_pos_ids_list: list[torch.Tensor] = []
-                for t_chunk in t_index_split_chunk:
-                    vision_ntoken_per_chunk = (
-                        len(t_chunk) * grid_h * grid_w // (spatial_merge_size**2)
-                    )
-                    new_src_item.extend([video_token_id] * vision_ntoken_per_chunk)
-                    vision_llm_pos_ids_list = cls._get_llm_pos_ids_for_vision(
-                        start_idx,
-                        video_idx,
-                        spatial_merge_size,
-                        t_chunk,
-                        grid_hs,
-                        grid_ws,
-                    ).split(1, dim=1)
-                    llm_pos_ids_list.extend(vision_llm_pos_ids_list)
-                    new_src_item.extend(
-                        min(t_ntoken_per_chunk, pure_audio_len - added_audio_len)
-                        * [audio_token_id]
-                    )
-                    audio_start_idx = (
-                        start_idx
-                        if len(audio_llm_pos_ids_list) == 0
-                        else audio_llm_pos_ids_list[-1][0].item() + 1
-                    )
-                    if min(t_ntoken_per_chunk, pure_audio_len - added_audio_len) > 0:
-                        audio_llm_pos_ids_list = (
-                            torch.arange(
-                                min(
-                                    t_ntoken_per_chunk, pure_audio_len - added_audio_len
-                                )
-                            ).expand(3, -1)
-                            + audio_start_idx
-                        ).split(1, dim=1)
-                    else:
-                        audio_llm_pos_ids_list = []
-                    added_audio_len += min(
-                        t_ntoken_per_chunk, pure_audio_len - added_audio_len
-                    )
-                    llm_pos_ids_list.extend(audio_llm_pos_ids_list)
-                if added_audio_len < pure_audio_len:
-                    new_src_item.extend(
-                        (pure_audio_len - added_audio_len) * [audio_token_id]
-                    )
-                    audio_llm_pos_ids_list = (
-                        torch.arange(pure_audio_len - added_audio_len).expand(3, -1)
-                        + llm_pos_ids_list[-1].max()
-                        + 1
-                    ).split(1, dim=1)
-                    llm_pos_ids_list.extend(audio_llm_pos_ids_list)
-                audio_idx += 1
-                video_idx += 1
-            # move to the next token
-            idx += len(new_src_item) - new_src_item_len
-
-        llm_positions = torch.cat(llm_pos_ids_list, dim=1)
-        mrope_position_delta = (
-            torch.cat(llm_pos_ids_list, dim=1).max() + 1 - len(src_item)
-        )
-        llm_positions = llm_positions[:, context_len:seq_len]
-
-        return llm_positions, mrope_position_delta
-
-    @staticmethod
-    def _get_llm_pos_ids_for_vision(
-        start_idx: int,
-        vision_idx: int,
-        spatial_merge_size: int,
-        t_index: list[int],
-        grid_hs: torch.Tensor,
-        grid_ws: torch.Tensor,
-    ) -> torch.Tensor:
-        llm_pos_ids_list = []
-        llm_grid_h = grid_hs[vision_idx] // spatial_merge_size
-        llm_grid_w = grid_ws[vision_idx] // spatial_merge_size
-        h_index = (
-            torch.arange(llm_grid_h)
-            .view(1, -1, 1)
-            .expand(len(t_index), -1, llm_grid_w)
-            .flatten()
-        )
-        w_index = (
-            torch.arange(llm_grid_w)
-            .view(1, 1, -1)
-            .expand(len(t_index), llm_grid_h, -1)
-            .flatten()
-        )
-        t_index_tensor = (
-            torch.Tensor(t_index)
-            .to(llm_grid_h.device)
-            .view(-1, 1)
-            .expand(-1, llm_grid_h * llm_grid_w)
-            .long()
-            .flatten()
-        )
-        _llm_pos_ids = torch.stack([t_index_tensor, h_index, w_index])
-        llm_pos_ids_list.append(_llm_pos_ids + start_idx)
-        llm_pos_ids = torch.cat(llm_pos_ids_list, dim=1)
-        return llm_pos_ids
-
-    @staticmethod
-    def _split_list_into_ranges(lst: torch.Tensor, interval: int) -> list[list[int]]:
-        ranges: list[list[int]] = [[] for _ in range((max(lst) // interval) + 1)]
-        for num in lst:
-            index = num // interval
-            ranges[index].append(num)
-        return ranges
-
     @staticmethod
     def get_next_input_positions(
         mrope_position_delta: int,
@@ -1403,54 +439,3 @@ class MRotaryEmbedding(RotaryEmbedding):
             dtype=out.dtype,
         )
         out[:, out_offset : out_offset + num_new_tokens] = values
-
-    @classmethod
-    def omni_get_updates_use_audio_in_video(
-        cls,
-        thinker_config: PretrainedConfig,
-        audio_len: int,
-        video_grid_thw: Union[list[int], torch.Tensor],
-        video_second_per_grid_t: float,
-    ) -> list[int]:
-        """Get video prompt updates when `use_audio_in_video` is True.
-
-        In this case, audio and vision update ids will be split into
-        chunks and interleaved (details in `_omni_get_input_positions_tensor`).
-
-        <|video_bos|><|VIDEO|><|video_eos|> =>
-        <|video_bos|><|audio_bos|>(... chunks ...)<|audio_eos|><|video_eos|>
-        """
-
-        audio_token_id = thinker_config.audio_token_index
-        video_token_id = thinker_config.video_token_index
-        audio_start_token_id = thinker_config.audio_start_token_id
-        audio_end_token_id = thinker_config.audio_end_token_id
-        seconds_per_chunk = thinker_config.seconds_per_chunk
-        spatial_merge_size = thinker_config.vision_config.spatial_merge_size
-        tokens_per_second = getattr(
-            thinker_config.vision_config, "tokens_per_second", 25
-        )
-
-        grid_t = video_grid_thw[0]
-        grid_h = video_grid_thw[1]
-        grid_w = video_grid_thw[2]
-        t_ntoken_per_chunk = int(tokens_per_second * seconds_per_chunk)
-        t_index = torch.arange(grid_t) * video_second_per_grid_t * tokens_per_second
-        t_index_split_chunk = cls._split_list_into_ranges(t_index, t_ntoken_per_chunk)
-
-        updates = [audio_start_token_id]
-        added_audio_len = 0
-        for t_chunk in t_index_split_chunk:
-            vision_ntoken_per_chunk = (
-                len(t_chunk) * grid_h * grid_w // (spatial_merge_size**2)
-            )
-            updates.extend([video_token_id] * vision_ntoken_per_chunk)
-
-            audio_chunk_size = min(t_ntoken_per_chunk, audio_len - added_audio_len)
-            updates.extend(audio_chunk_size * [audio_token_id])
-            added_audio_len += audio_chunk_size
-        if added_audio_len < audio_len:
-            updates.extend((audio_len - added_audio_len) * [audio_token_id])
-        updates.extend([audio_end_token_id])
-
-        return updates
diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index 2579a0ebf53ef..d5b2caa2ddfd6 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -23,6 +23,7 @@
 # limitations under the License.
 """Inference-only Erine VL model compatible with HuggingFace weights."""
 
+import itertools
 import math
 from collections.abc import Iterable, Mapping, Sequence
 from functools import partial
@@ -33,7 +34,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange, repeat
-from transformers import BatchFeature
+from transformers import BatchFeature, PretrainedConfig
 
 from vllm.attention.backends.registry import _Backend
 from vllm.attention.layer import (
@@ -76,6 +77,7 @@ from .ernie45_vl_moe import Ernie4_5_VLMoeForCausalLM
 from .interfaces import (
     MultiModalEmbeddings,
     SupportsLoRA,
+    SupportsMRoPE,
     SupportsMultiModal,
     SupportsPP,
 )
@@ -1271,7 +1273,7 @@ class Ernie4_5_VLDummyInputsBuilder(BaseDummyInputsBuilder[Ernie4_5_VLProcessing
     dummy_inputs=Ernie4_5_VLDummyInputsBuilder,
 )
 class Ernie4_5_VLMoeForConditionalGeneration(
-    nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP
+    nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
 ):
     merge_by_field_config = True
 
@@ -1388,6 +1390,151 @@ class Ernie4_5_VLMoeForConditionalGeneration(
         else:
             self.visual_token_mask = None
 
+    @classmethod
+    def get_mrope_input_positions(
+        cls,
+        input_tokens: list[int],
+        hf_config: PretrainedConfig,
+        image_grid_thw: Union[list[list[int]], torch.Tensor],
+        video_grid_thw: Union[list[list[int]], torch.Tensor],
+        context_len: int = 0,
+        seq_len: Optional[int] = None,
+        second_per_grid_ts: Optional[list[float]] = None,
+        audio_feature_lengths: Optional[torch.Tensor] = None,
+        use_audio_in_video: bool = False,
+    ) -> tuple[torch.Tensor, int]:
+        """Get mrope input positions and delta value for Ernie VL."""
+
+        image_token_id = hf_config.im_patch_id
+        video_start_token_id = hf_config.video_start_token_id
+        video_end_token_id = hf_config.video_end_token_id
+        spatial_conv_size = hf_config.spatial_conv_size
+        temporal_conv_size = hf_config.temporal_conv_size
+        llm_pos_ids_list: list = []
+
+        if not (image_grid_thw is None and video_grid_thw is None):
+            if isinstance(image_grid_thw, torch.Tensor):
+                image_grid_thw = image_grid_thw.tolist()
+
+            input_token_type: list[str] = []
+            video_check_flg = False
+            for token in input_tokens:
+                if token == video_start_token_id:
+                    video_check_flg = True
+                elif token == video_end_token_id:
+                    video_check_flg = False
+
+                if (token == image_token_id) and (video_check_flg is False):
+                    input_token_type.append("image")
+                elif (token == image_token_id) and (video_check_flg is True):
+                    input_token_type.append("video")
+                else:
+                    input_token_type.append("text")
+
+            input_type_group: list[tuple[str, int, int]] = []
+            for key, group_iter in itertools.groupby(
+                enumerate(input_token_type), lambda x: x[1]
+            ):
+                group_list = list(group_iter)
+                start_index = group_list[0][0]
+                end_index = group_list[-1][0] + 1
+                input_type_group.append((key, start_index, end_index))
+
+            video_frame_num = 1
+            mm_data_idx = 0
+            for modality_type, start_idx, end_idx in input_type_group:
+                st_idx = (
+                    llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+                )
+                if modality_type == "image":
+                    t, h, w = (
+                        image_grid_thw[mm_data_idx][0],
+                        image_grid_thw[mm_data_idx][1],
+                        image_grid_thw[mm_data_idx][2],
+                    )
+                    llm_grid_t, llm_grid_h, llm_grid_w = (
+                        t,
+                        h // spatial_conv_size,
+                        w // spatial_conv_size,
+                    )
+
+                    t_index = (
+                        torch.arange(llm_grid_t)
+                        .view(-1, 1)
+                        .expand(-1, llm_grid_h * llm_grid_w)
+                        .flatten()
+                    )
+                    h_index = (
+                        torch.arange(llm_grid_h)
+                        .view(1, -1, 1)
+                        .expand(llm_grid_t, -1, llm_grid_w)
+                        .flatten()
+                    )
+                    w_index = (
+                        torch.arange(llm_grid_w)
+                        .view(1, 1, -1)
+                        .expand(llm_grid_t, llm_grid_h, -1)
+                        .flatten()
+                    )
+                    llm_pos_ids_list.append(
+                        torch.stack([t_index, h_index, w_index]) + st_idx
+                    )
+                    mm_data_idx += 1
+
+                elif modality_type == "video":
+                    t, h, w = (
+                        video_grid_thw[mm_data_idx][0],
+                        video_grid_thw[mm_data_idx][1],
+                        video_grid_thw[mm_data_idx][2],
+                    )
+                    llm_grid_t, llm_grid_h, llm_grid_w = (
+                        t // temporal_conv_size,
+                        h // spatial_conv_size,
+                        w // spatial_conv_size,
+                    )
+
+                    for t_idx in range(llm_grid_t):
+                        t_index = (
+                            torch.tensor(t_idx)
+                            .view(-1, 1)
+                            .expand(-1, llm_grid_h * llm_grid_w)
+                            .flatten()
+                        )
+                        h_index = (
+                            torch.arange(llm_grid_h)
+                            .view(1, -1, 1)
+                            .expand(1, -1, llm_grid_w)
+                            .flatten()
+                        )
+                        w_index = (
+                            torch.arange(llm_grid_w)
+                            .view(1, 1, -1)
+                            .expand(1, llm_grid_h, -1)
+                            .flatten()
+                        )
+                        llm_pos_ids_list.append(
+                            torch.stack([t_index, h_index, w_index]) + st_idx
+                        )
+
+                    mm_data_idx += 1
+                    video_frame_num += 1
+
+                else:
+                    text_len = end_idx - start_idx
+                    llm_pos_ids_list.append(
+                        torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+                    )
+                    video_frame_num = 1
+
+        else:
+            text_len = len(input_tokens)
+            llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1))
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        llm_positions = llm_positions[:, context_len:seq_len]
+        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
+        return llm_positions, mrope_position_delta
+
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py
index a5c3ce0e6bf74..63731b2947d2d 100644
--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -5,6 +5,7 @@
 # https://github.com/zai-org/CogAgent
 """Inference-only CogAgent model compatible with THUDM weights."""
 
+import itertools
 from argparse import Namespace
 from collections.abc import Mapping, Sequence
 from typing import Annotated, Literal, Optional, Union
@@ -14,7 +15,7 @@ from torch import nn
 from torch.nn import LayerNorm
 from torchvision import transforms
 from torchvision.transforms import InterpolationMode
-from transformers import BatchFeature, PreTrainedTokenizer, TensorType
+from transformers import BatchFeature, PretrainedConfig, PreTrainedTokenizer, TensorType
 from transformers.image_utils import ImageInput
 from transformers.tokenization_utils_base import TextInput
 
@@ -54,6 +55,7 @@ from .chatglm import ChatGLMBaseModel, ChatGLMModel
 from .interfaces import (
     MultiModalEmbeddings,
     SupportsLoRA,
+    SupportsMRoPE,
     SupportsMultiModal,
     SupportsPP,
 )
@@ -554,7 +556,9 @@ class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):
     info=GLM4VProcessingInfo,
     dummy_inputs=GLM4VDummyInputsBuilder,
 )
-class GLM4VForCausalLM(ChatGLMBaseModel, SupportsMultiModal, SupportsLoRA, SupportsPP):
+class GLM4VForCausalLM(
+    ChatGLMBaseModel, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
+):
     merge_by_field_config = True
 
     packed_modules_mapping = {
@@ -615,6 +619,150 @@ class GLM4VForCausalLM(ChatGLMBaseModel, SupportsMultiModal, SupportsLoRA, Suppo
 
         return self.transformer.vision(pixel_values)
 
+    @classmethod
+    def get_mrope_input_positions(
+        cls,
+        input_tokens: list[int],
+        hf_config: PretrainedConfig,
+        image_grid_thw: Union[list[list[int]], torch.Tensor],
+        video_grid_thw: Union[list[list[int]], torch.Tensor],
+        context_len: int = 0,
+        seq_len: Optional[int] = None,
+        second_per_grid_ts: Optional[list[float]] = None,
+        audio_feature_lengths: Optional[torch.Tensor] = None,
+        use_audio_in_video: bool = False,
+    ) -> tuple[torch.Tensor, int]:
+        """Get mrope input positions and delta value for GLM4V."""
+
+        image_token_id = hf_config.image_token_id
+        video_start_token_id = hf_config.video_start_token_id
+        video_end_token_id = hf_config.video_end_token_id
+        spatial_merge_size = hf_config.vision_config.spatial_merge_size
+        llm_pos_ids_list: list = []
+
+        if not (image_grid_thw is None and video_grid_thw is None):
+            if isinstance(image_grid_thw, torch.Tensor):
+                image_grid_thw = image_grid_thw.tolist()
+
+            input_token_type: list[str] = []
+            video_check_flg = False
+            for token in input_tokens:
+                if token == video_start_token_id:
+                    video_check_flg = True
+                elif token == video_end_token_id:
+                    video_check_flg = False
+
+                if (token == image_token_id) and (video_check_flg is False):
+                    input_token_type.append("image")
+                elif (token == image_token_id) and (video_check_flg is True):
+                    input_token_type.append("video")
+                else:
+                    input_token_type.append("text")
+
+            input_type_group: list[tuple[str, int, int]] = []
+            for key, group_iter in itertools.groupby(
+                enumerate(input_token_type), lambda x: x[1]
+            ):
+                group_list = list(group_iter)
+                start_index = group_list[0][0]
+                end_index = group_list[-1][0] + 1
+                input_type_group.append((key, start_index, end_index))
+
+            video_frame_num = 1
+            mm_data_idx = 0
+            for modality_type, start_idx, end_idx in input_type_group:
+                st_idx = (
+                    llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+                )
+                if modality_type == "image":
+                    t, h, w = (
+                        image_grid_thw[mm_data_idx][0],
+                        image_grid_thw[mm_data_idx][1],
+                        image_grid_thw[mm_data_idx][2],
+                    )
+                    llm_grid_t, llm_grid_h, llm_grid_w = (
+                        t,
+                        h // spatial_merge_size,
+                        w // spatial_merge_size,
+                    )
+
+                    t_index = (
+                        torch.arange(llm_grid_t)
+                        .view(-1, 1)
+                        .expand(-1, llm_grid_h * llm_grid_w)
+                        .flatten()
+                    )
+                    h_index = (
+                        torch.arange(llm_grid_h)
+                        .view(1, -1, 1)
+                        .expand(llm_grid_t, -1, llm_grid_w)
+                        .flatten()
+                    )
+                    w_index = (
+                        torch.arange(llm_grid_w)
+                        .view(1, 1, -1)
+                        .expand(llm_grid_t, llm_grid_h, -1)
+                        .flatten()
+                    )
+                    llm_pos_ids_list.append(
+                        torch.stack([t_index, h_index, w_index]) + st_idx
+                    )
+                    mm_data_idx += 1
+
+                elif modality_type == "video":
+                    t, h, w = (
+                        video_frame_num,
+                        image_grid_thw[mm_data_idx][1],
+                        image_grid_thw[mm_data_idx][2],
+                    )
+                    llm_grid_t, llm_grid_h, llm_grid_w = (
+                        t,
+                        h // spatial_merge_size,
+                        w // spatial_merge_size,
+                    )
+
+                    for t_idx in range(llm_grid_t):
+                        t_index = (
+                            torch.tensor(t_idx)
+                            .view(-1, 1)
+                            .expand(-1, llm_grid_h * llm_grid_w)
+                            .flatten()
+                        )
+                        h_index = (
+                            torch.arange(llm_grid_h)
+                            .view(1, -1, 1)
+                            .expand(1, -1, llm_grid_w)
+                            .flatten()
+                        )
+                        w_index = (
+                            torch.arange(llm_grid_w)
+                            .view(1, 1, -1)
+                            .expand(1, llm_grid_h, -1)
+                            .flatten()
+                        )
+                        llm_pos_ids_list.append(
+                            torch.stack([t_index, h_index, w_index]) + st_idx
+                        )
+
+                    mm_data_idx += 1
+                    video_frame_num += 1
+
+                else:
+                    text_len = end_idx - start_idx
+                    llm_pos_ids_list.append(
+                        torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+                    )
+                    video_frame_num = 1
+
+        else:
+            text_len = len(input_tokens)
+            llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1))
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        llm_positions = llm_positions[:, context_len:seq_len]
+        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
+        return llm_positions, mrope_position_delta
+
     def get_language_model(self) -> torch.nn.Module:
         return self.transformer
 
diff --git a/vllm/model_executor/models/keye_vl1_5.py b/vllm/model_executor/models/keye_vl1_5.py
index 578436fcad219..21d8099b43d16 100644
--- a/vllm/model_executor/models/keye_vl1_5.py
+++ b/vllm/model_executor/models/keye_vl1_5.py
@@ -38,7 +38,7 @@ from vllm.multimodal.processing import (
 )
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
-from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
+from .interfaces import SupportsLoRA, SupportsMRoPE, SupportsMultiModal, SupportsPP
 from .keye import (
     BaseKeyeModule,
     BaseMultiModalProcessor,
@@ -493,7 +493,7 @@ class KeyeVL1_5DummyInputsBuilder(
     dummy_inputs=KeyeVL1_5DummyInputsBuilder,
 )
 class KeyeVL1_5ForConditionalGeneration(
-    BaseKeyeModule, SupportsMultiModal, SupportsLoRA, SupportsPP
+    BaseKeyeModule, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
 ):
     def _build_projector(
         self,
@@ -589,3 +589,143 @@ class KeyeVL1_5ForConditionalGeneration(
             end = patch_cu_seqlens[idx + 1]
             new_video_embeds.append(video_embeds[start:end])
         return tuple(new_video_embeds)
+
+    @classmethod
+    def get_mrope_input_positions(
+        cls,
+        input_tokens: list[int],
+        hf_config: PretrainedConfig,
+        image_grid_thw: Union[list[list[int]], torch.Tensor],
+        video_grid_thw: Union[list[list[int]], torch.Tensor],
+        context_len: int = 0,
+        seq_len: Optional[int] = None,
+        second_per_grid_ts: Optional[list[float]] = None,
+        audio_feature_lengths: Optional[torch.Tensor] = None,
+        use_audio_in_video: bool = False,
+    ) -> tuple[torch.Tensor, int]:
+        """Get mrope input positions and delta value (Keye series)."""
+        if isinstance(video_grid_thw, list) and len(video_grid_thw) > 0:
+            video_grid_thw = video_grid_thw[0]
+
+        def split_thw(grid_thw: Union[torch.Tensor, list[int]]) -> list[list[int]]:
+            """
+            Split grid_thw along the t dimension.
+
+            Args:
+                grid_thw: shape [N, 3] tensor or nested list of [t, h, w].
+
+            Returns:
+                List of [1, h, w] rows, repeated t times for each original row.
+            """
+
+            if isinstance(grid_thw, list):
+                grid_thw = torch.tensor(grid_thw, dtype=torch.long)
+
+            if grid_thw.numel() == 0:
+                return []
+
+            t, hw = grid_thw[:, 0], grid_thw[:, 1:]
+            ones = torch.ones_like(hw[:, :1])  # [N,1]
+            out = torch.cat([ones, hw], dim=1).repeat_interleave(t, dim=0)
+            return out.tolist()
+
+        video_grid_thw = split_thw(video_grid_thw)
+
+        image_token_id = hf_config.image_token_id
+        video_token_id = hf_config.video_token_id
+        spatial_merge_size = hf_config.vision_config.spatial_merge_size
+
+        image_nums = len(image_grid_thw)
+        frame_nums = len(video_grid_thw)
+        llm_pos_ids_list: list = []
+
+        st = 0
+        remain_images, remain_frames = image_nums, frame_nums
+
+        image_index, video_index = 0, 0
+        for _ in range(image_nums + frame_nums):
+            if remain_images > 0:
+                try:
+                    ed_image = input_tokens.index(image_token_id, st)
+                except ValueError:
+                    ed_image = len(input_tokens) + 1
+            else:
+                ed_image = len(input_tokens) + 1
+            if remain_frames > 0:
+                try:
+                    ed_video = input_tokens.index(video_token_id, st)
+                except ValueError:
+                    ed_video = len(input_tokens) + 1
+            else:
+                ed_video = len(input_tokens) + 1
+
+            if ed_image < ed_video:
+                t, h, w = (
+                    image_grid_thw[image_index][0],
+                    image_grid_thw[image_index][1],
+                    image_grid_thw[image_index][2],
+                )
+                image_index += 1
+                remain_images -= 1
+                ed = ed_image
+            else:
+                t, h, w = (
+                    video_grid_thw[video_index][0],
+                    video_grid_thw[video_index][1],
+                    video_grid_thw[video_index][2],
+                )
+                video_index += 1
+                remain_frames -= 1
+                ed = ed_video
+
+            llm_grid_t, llm_grid_h, llm_grid_w = (
+                t,
+                h // spatial_merge_size,
+                w // spatial_merge_size,
+            )
+            text_len = ed - st
+
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+            )
+
+            t_index = (
+                (
+                    torch.arange(llm_grid_t)
+                    .view(-1, 1)
+                    .expand(-1, llm_grid_h * llm_grid_w)
+                )
+                .long()
+                .flatten()
+            )
+
+            h_index = (
+                torch.arange(llm_grid_h)
+                .view(1, -1, 1)
+                .expand(llm_grid_t, -1, llm_grid_w)
+                .flatten()
+            )
+            w_index = (
+                torch.arange(llm_grid_w)
+                .view(1, 1, -1)
+                .expand(llm_grid_t, llm_grid_h, -1)
+                .flatten()
+            )
+            llm_pos_ids_list.append(
+                torch.stack([t_index, h_index, w_index]) + text_len + st_idx
+            )
+            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+
+        if st < len(input_tokens):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            text_len = len(input_tokens) - st
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+            )
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
+        llm_positions = llm_positions[:, context_len:seq_len]
+
+        return llm_positions, mrope_position_delta
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index 1ab2f43c9d736..0df79fc733f3f 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -29,6 +29,7 @@ from typing import Annotated, Any, Callable, Literal, Optional, Union
 
 import torch
 import torch.nn as nn
+from transformers import PretrainedConfig
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import (
     Qwen2_5OmniConfig,
@@ -45,7 +46,6 @@ from transformers.models.whisper import WhisperFeatureExtractor
 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.logger import init_logger
-from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.qwen2_5_vl import (
     Qwen2_5_VisionTransformer,
@@ -93,6 +93,7 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape
 from .interfaces import (
     MultiModalEmbeddings,
     SupportsLoRA,
+    SupportsMRoPE,
     SupportsMultiModal,
     SupportsPP,
 )
@@ -101,7 +102,9 @@ from .utils import (
     WeightsMapper,
     init_vllm_registered_model,
     maybe_prefix,
+    split_list_into_ranges,
 )
+from .vision import get_llm_pos_ids_for_vision
 
 try:
     import flash_attn
@@ -412,6 +415,59 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
 
         return prompt_ids, mm_placeholders
 
+    @classmethod
+    def omni_get_updates_use_audio_in_video(
+        cls,
+        thinker_config: PretrainedConfig,
+        audio_len: int,
+        video_grid_thw: Union[list[int], torch.Tensor],
+        video_second_per_grid_t: float,
+    ) -> list[int]:
+        """Get video prompt updates when `use_audio_in_video` is True.
+
+        In this case, audio and vision update ids will be split into
+        chunks and interleaved (details in `get_mrope_input_positions`).
+
+        <|video_bos|><|VIDEO|><|video_eos|> =>
+        <|video_bos|><|audio_bos|>(... chunks ...)<|audio_eos|><|video_eos|>
+        """
+
+        audio_token_id = thinker_config.audio_token_index
+        video_token_id = thinker_config.video_token_index
+        audio_start_token_id = thinker_config.audio_start_token_id
+        audio_end_token_id = thinker_config.audio_end_token_id
+        seconds_per_chunk = thinker_config.seconds_per_chunk
+        spatial_merge_size = thinker_config.vision_config.spatial_merge_size
+        tokens_per_second = getattr(
+            thinker_config.vision_config, "tokens_per_second", 25
+        )
+
+        grid_t = video_grid_thw[0]
+        grid_h = video_grid_thw[1]
+        grid_w = video_grid_thw[2]
+        t_ntoken_per_chunk = int(tokens_per_second * seconds_per_chunk)
+        t_index = (
+            torch.arange(grid_t) * video_second_per_grid_t * tokens_per_second
+        ).long()
+        t_index_split_chunk = split_list_into_ranges(t_index, t_ntoken_per_chunk)
+
+        updates = [audio_start_token_id]
+        added_audio_len = 0
+        for t_chunk in t_index_split_chunk:
+            vision_ntoken_per_chunk = (
+                len(t_chunk) * grid_h * grid_w // (spatial_merge_size**2)
+            )
+            updates.extend([video_token_id] * vision_ntoken_per_chunk)
+
+            audio_chunk_size = min(t_ntoken_per_chunk, audio_len - added_audio_len)
+            updates.extend(audio_chunk_size * [audio_token_id])
+            added_audio_len += audio_chunk_size
+        if added_audio_len < audio_len:
+            updates.extend((audio_len - added_audio_len) * [audio_token_id])
+        updates.extend([audio_end_token_id])
+
+        return updates
+
     def _get_prompt_updates(
         self,
         mm_items: MultiModalDataItems,
@@ -491,7 +547,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
             else:
                 video_second_per_grid_t = 1.0
 
-            return MRotaryEmbedding.omni_get_updates_use_audio_in_video(
+            return self.omni_get_updates_use_audio_in_video(
                 thinker_config=thinker_config,
                 audio_len=audio_num_features,
                 video_grid_thw=video_grid_thw,
@@ -808,6 +864,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
     SupportsMultiModal,
     SupportsPP,
     SupportsLoRA,
+    SupportsMRoPE,
     Qwen2_5OmniConditionalGenerationMixin,
 ):
     hf_to_vllm_mapper = WeightsMapper(
@@ -929,6 +986,216 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
+    @classmethod
+    def get_mrope_input_positions(
+        cls,
+        input_tokens: list[int],
+        hf_config: PretrainedConfig,
+        image_grid_thw: Union[list[list[int]], torch.Tensor],
+        video_grid_thw: Union[list[list[int]], torch.Tensor],
+        second_per_grid_ts: Optional[list[float]] = None,
+        context_len: int = 0,
+        seq_len: Optional[int] = None,
+        audio_feature_lengths: Optional[torch.Tensor] = None,
+        use_audio_in_video: bool = False,
+    ) -> tuple[torch.Tensor, int]:
+        """Get mrope input positions and delta value (Qwen2.5-Omni version).
+
+        Differences from MRotaryEmbedding:
+            1. Add audio support (and related `audio_feature_lengths`).
+            2. Add `use_audio_in_video` option to read audio from video inputs.
+                In this case, audio and vision position ids will be split into
+                chunks and interleaved.
+
+        Example:
+
+            (V_i are vision position ids, A_i are audio position ids)
+
+            |V_1 ...    V_n|A_1 ...   A_n|V_n+1 ... V_2n|A_n+1 ... A_2n|...
+            |vision chunk 1|audio chunk 1|vision chunk 2|audio chunk 2 |...
+        """
+
+        # TODO(fyabc): refactor and share more code with
+        #  _vl_get_input_positions_tensor.
+
+        thinker_config = hf_config.thinker_config
+        audio_token_id = thinker_config.audio_token_index
+        image_token_id = thinker_config.image_token_index
+        video_token_id = thinker_config.video_token_index
+        audio_start_token_id = thinker_config.audio_start_token_id
+        audio_end_token_id = thinker_config.audio_end_token_id
+        vision_start_token_id = thinker_config.vision_start_token_id
+        vision_end_token_id = thinker_config.vision_end_token_id
+        seconds_per_chunk = thinker_config.seconds_per_chunk
+        spatial_merge_size = thinker_config.vision_config.spatial_merge_size
+        tokens_per_second = getattr(
+            thinker_config.vision_config, "tokens_per_second", 25
+        )
+
+        if isinstance(image_grid_thw, list):
+            image_grid_thw = torch.tensor(image_grid_thw)
+        if isinstance(video_grid_thw, list):
+            video_grid_thw = torch.tensor(video_grid_thw)
+
+        src_item = input_tokens
+        audio_seqlens = audio_feature_lengths
+        if not second_per_grid_ts:
+            second_per_grid_ts = [1] * video_grid_thw.shape[0]
+        audio_idx = 0
+        video_idx = 0
+        image_idx = 0
+        new_src_item: list[int] = []
+        llm_pos_ids_list: list[torch.Tensor] = []
+
+        idx = 0
+        while idx < len(src_item):
+            new_src_item_len = len(new_src_item)
+            start_idx = (
+                llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            )
+            if src_item[idx] not in [audio_token_id, video_token_id, image_token_id]:
+                if use_audio_in_video and idx > 0:
+                    if (
+                        src_item[idx] == vision_end_token_id
+                        and src_item[idx - 1] == audio_end_token_id
+                    ):
+                        # <|vision_eos|> right after <|audio_eos|> shares its position
+                        start_idx -= 1
+                    elif (
+                        src_item[idx] == audio_start_token_id
+                        and src_item[idx - 1] == vision_start_token_id
+                    ):
+                        # <|audio_bos|> right after <|vision_bos|> shares its position
+                        start_idx -= 1
+                new_src_item.append(src_item[idx])
+                llm_pos_ids = torch.tensor([start_idx], dtype=torch.long).expand(3, -1)
+                llm_pos_ids_list.append(llm_pos_ids)
+            elif src_item[idx] == audio_token_id:
+                assert audio_seqlens is not None
+                audio_seqlen = audio_seqlens[audio_idx]
+                place_num = ((audio_seqlen - 1) // 2 + 1 - 2) // 2 + 1
+                new_src_item.extend([audio_token_id] * place_num)
+                llm_pos_ids = torch.arange(place_num).expand(3, -1) + start_idx
+                llm_pos_ids_list.append(llm_pos_ids)
+                audio_idx += 1
+            elif src_item[idx] == image_token_id:
+                grid_t = image_grid_thw[image_idx][0]
+                grid_hs = image_grid_thw[:, 1]
+                grid_ws = image_grid_thw[:, 2]
+                t_index = (torch.arange(grid_t) * 1 * tokens_per_second).long()
+                llm_pos_ids = get_llm_pos_ids_for_vision(
+                    start_idx, image_idx, spatial_merge_size, t_index, grid_hs, grid_ws
+                )
+                llm_pos_ids_list.append(llm_pos_ids)
+                vision_seqlen = image_grid_thw[image_idx].prod() // (
+                    spatial_merge_size**2
+                )
+                new_src_item.extend([image_token_id] * vision_seqlen)
+                image_idx += 1
+            elif src_item[idx] == video_token_id and not use_audio_in_video:
+                grid_t = video_grid_thw[video_idx][0]
+                grid_hs = video_grid_thw[:, 1]
+                grid_ws = video_grid_thw[:, 2]
+                t_index = (
+                    torch.arange(grid_t)
+                    * second_per_grid_ts[video_idx]
+                    * tokens_per_second
+                ).long()
+                llm_pos_ids = get_llm_pos_ids_for_vision(
+                    start_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws
+                )
+                llm_pos_ids_list.append(llm_pos_ids)
+                vision_seqlen = video_grid_thw[video_idx].prod() // (
+                    spatial_merge_size**2
+                )
+                new_src_item.extend([video_token_id] * vision_seqlen)
+                video_idx += 1
+            else:
+                # read audio from video
+                assert audio_seqlens is not None
+                audio_seqlen = audio_seqlens[audio_idx]
+                vision_seqlen = video_grid_thw[video_idx].prod() // (
+                    spatial_merge_size**2
+                )
+                grid_t = video_grid_thw[video_idx][0]
+                grid_h = video_grid_thw[video_idx][1]
+                grid_w = video_grid_thw[video_idx][2]
+                grid_hs = video_grid_thw[:, 1]
+                grid_ws = video_grid_thw[:, 2]
+                t_ntoken_per_chunk = int(tokens_per_second * seconds_per_chunk)
+                t_index = (
+                    torch.arange(grid_t)
+                    * second_per_grid_ts[video_idx]
+                    * tokens_per_second
+                ).long()
+                t_index_split_chunk = split_list_into_ranges(
+                    t_index, t_ntoken_per_chunk
+                )
+                place_num = (((audio_seqlen - 1) // 2 + 1 - 2) // 2 + 1) + 2
+                pure_audio_len = place_num - 2
+                added_audio_len = 0
+                audio_llm_pos_ids_list: list[torch.Tensor] = []
+                for t_chunk in t_index_split_chunk:
+                    vision_ntoken_per_chunk = (
+                        len(t_chunk) * grid_h * grid_w // (spatial_merge_size**2)
+                    )
+                    new_src_item.extend([video_token_id] * vision_ntoken_per_chunk)
+                    vision_llm_pos_ids_list = get_llm_pos_ids_for_vision(
+                        start_idx,
+                        video_idx,
+                        spatial_merge_size,
+                        t_chunk,
+                        grid_hs,
+                        grid_ws,
+                    ).split(1, dim=1)
+                    llm_pos_ids_list.extend(vision_llm_pos_ids_list)
+                    new_src_item.extend(
+                        min(t_ntoken_per_chunk, pure_audio_len - added_audio_len)
+                        * [audio_token_id]
+                    )
+                    audio_start_idx = (
+                        start_idx
+                        if len(audio_llm_pos_ids_list) == 0
+                        else audio_llm_pos_ids_list[-1][0].item() + 1
+                    )
+                    if min(t_ntoken_per_chunk, pure_audio_len - added_audio_len) > 0:
+                        audio_llm_pos_ids_list = (
+                            torch.arange(
+                                min(
+                                    t_ntoken_per_chunk, pure_audio_len - added_audio_len
+                                )
+                            ).expand(3, -1)
+                            + audio_start_idx
+                        ).split(1, dim=1)
+                    else:
+                        audio_llm_pos_ids_list = []
+                    added_audio_len += min(
+                        t_ntoken_per_chunk, pure_audio_len - added_audio_len
+                    )
+                    llm_pos_ids_list.extend(audio_llm_pos_ids_list)
+                if added_audio_len < pure_audio_len:
+                    new_src_item.extend(
+                        (pure_audio_len - added_audio_len) * [audio_token_id]
+                    )
+                    audio_llm_pos_ids_list = (
+                        torch.arange(pure_audio_len - added_audio_len).expand(3, -1)
+                        + llm_pos_ids_list[-1].max()
+                        + 1
+                    ).split(1, dim=1)
+                    llm_pos_ids_list.extend(audio_llm_pos_ids_list)
+                audio_idx += 1
+                video_idx += 1
+            # move to the next token
+            idx += len(new_src_item) - new_src_item_len
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1)
+        mrope_position_delta = (
+            torch.cat(llm_pos_ids_list, dim=1).max() + 1 - len(src_item)
+        )
+        llm_positions = llm_positions[:, context_len:seq_len]
+
+        return llm_positions, mrope_position_delta
+
     def get_multimodal_embeddings(self, **kwargs: object) -> MultiModalEmbeddings:
         mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not mm_input_by_modality:
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 9cd83f61d9213..094fd90aac4e5 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -34,7 +34,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
-from transformers import BatchFeature
+from transformers import BatchFeature, PretrainedConfig
 from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor
 from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
     Qwen2_5_VLConfig,
@@ -79,6 +79,7 @@ from .interfaces import (
     MultiModalEmbeddings,
     SupportsEagle3,
     SupportsLoRA,
+    SupportsMRoPE,
     SupportsMultiModal,
     SupportsMultiModalPruning,
     SupportsPP,
@@ -1053,6 +1054,7 @@ class Qwen2_5_VLForConditionalGeneration(
     SupportsQuant,
     SupportsEagle3,
     SupportsMultiModalPruning,
+    SupportsMRoPE,
 ):
     packed_modules_mapping = {
         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
@@ -1073,6 +1075,132 @@ class Qwen2_5_VLForConditionalGeneration(
 
     supports_encoder_tp_data = True
 
+    @classmethod
+    def get_mrope_input_positions(
+        cls,
+        input_tokens: list[int],
+        hf_config: PretrainedConfig,
+        image_grid_thw: Union[list[list[int]], torch.Tensor],
+        video_grid_thw: Union[list[list[int]], torch.Tensor],
+        second_per_grid_ts: list[float],
+        context_len: int = 0,
+        seq_len: Optional[int] = None,
+        audio_feature_lengths: Optional[torch.Tensor] = None,
+        use_audio_in_video: bool = False,
+    ) -> tuple[torch.Tensor, int]:
+        """Get mrope input positions and delta value."""
+
+        image_token_id = hf_config.image_token_id
+        video_token_id = hf_config.video_token_id
+        vision_start_token_id = hf_config.vision_start_token_id
+        spatial_merge_size = hf_config.vision_config.spatial_merge_size
+        tokens_per_second = getattr(hf_config.vision_config, "tokens_per_second", 1.0)
+
+        input_tokens_tensor = torch.tensor(input_tokens)
+        vision_start_indices = torch.argwhere(
+            input_tokens_tensor == vision_start_token_id
+        ).squeeze(1)
+        vision_tokens = input_tokens_tensor[vision_start_indices + 1]
+        image_nums = (vision_tokens == image_token_id).sum()
+        video_nums = (vision_tokens == video_token_id).sum()
+        llm_pos_ids_list: list = []
+
+        st = 0
+        remain_images, remain_videos = image_nums, video_nums
+
+        image_index, video_index = 0, 0
+        for _ in range(image_nums + video_nums):
+            video_second_per_grid_t = 0.0
+            if remain_images > 0:
+                try:
+                    ed_image = input_tokens.index(image_token_id, st)
+                except ValueError:
+                    ed_image = len(input_tokens) + 1
+            else:
+                ed_image = len(input_tokens) + 1
+            if remain_videos > 0:
+                try:
+                    ed_video = input_tokens.index(video_token_id, st)
+                except ValueError:
+                    ed_video = len(input_tokens) + 1
+            else:
+                ed_video = len(input_tokens) + 1
+            if ed_image < ed_video:
+                t, h, w = (
+                    image_grid_thw[image_index][0],
+                    image_grid_thw[image_index][1],
+                    image_grid_thw[image_index][2],
+                )
+                image_index += 1
+                remain_images -= 1
+                ed = ed_image
+            else:
+                t, h, w = (
+                    video_grid_thw[video_index][0],
+                    video_grid_thw[video_index][1],
+                    video_grid_thw[video_index][2],
+                )
+                video_second_per_grid_t = 1.0
+                if second_per_grid_ts:
+                    video_second_per_grid_t = second_per_grid_ts[video_index]
+                video_index += 1
+                remain_videos -= 1
+                ed = ed_video
+
+            llm_grid_t, llm_grid_h, llm_grid_w = (
+                t,
+                h // spatial_merge_size,
+                w // spatial_merge_size,
+            )
+            text_len = ed - st
+
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+            )
+
+            t_index = (
+                (
+                    torch.arange(llm_grid_t)
+                    .view(-1, 1)
+                    .expand(-1, llm_grid_h * llm_grid_w)
+                    * video_second_per_grid_t
+                    * tokens_per_second
+                )
+                .long()
+                .flatten()
+            )
+
+            h_index = (
+                torch.arange(llm_grid_h)
+                .view(1, -1, 1)
+                .expand(llm_grid_t, -1, llm_grid_w)
+                .flatten()
+            )
+            w_index = (
+                torch.arange(llm_grid_w)
+                .view(1, 1, -1)
+                .expand(llm_grid_t, llm_grid_h, -1)
+                .flatten()
+            )
+            llm_pos_ids_list.append(
+                torch.stack([t_index, h_index, w_index]) + text_len + st_idx
+            )
+            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+
+        if st < len(input_tokens):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            text_len = len(input_tokens) - st
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+            )
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
+        llm_positions = llm_positions[:, context_len:seq_len]
+
+        return llm_positions, mrope_position_delta
+
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
         if modality.startswith("image"):
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 8862e88bd531f..1e6c3485c4d60 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -33,7 +33,7 @@ import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from transformers import BatchFeature
+from transformers import BatchFeature, PretrainedConfig
 from transformers.models.qwen2_vl import Qwen2VLImageProcessorFast
 from transformers.models.qwen2_vl.image_processing_qwen2_vl import (
     smart_resize as image_smart_resize,
@@ -84,6 +84,7 @@ from vllm.utils import is_list_of
 from .interfaces import (
     MultiModalEmbeddings,
     SupportsLoRA,
+    SupportsMRoPE,
     SupportsMultiModal,
     SupportsPP,
 )
@@ -1174,7 +1175,7 @@ class Qwen3LLMForCausalLM(Qwen3ForCausalLM):
     dummy_inputs=Qwen3VLDummyInputsBuilder,
 )
 class Qwen3VLForConditionalGeneration(
-    nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP
+    nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
 ):
     packed_modules_mapping = {
         "qkv_proj": [
@@ -1480,6 +1481,116 @@ class Qwen3VLForConditionalGeneration(
                 )
         return mm_input_by_modality
 
+    @classmethod
+    def get_mrope_input_positions(
+        cls,
+        input_tokens: list[int],
+        hf_config: PretrainedConfig,
+        image_grid_thw: Union[list[list[int]], torch.Tensor],
+        video_grid_thw: Union[list[list[int]], torch.Tensor],
+        context_len: int = 0,
+        seq_len: Optional[int] = None,
+        second_per_grid_ts: Optional[list[float]] = None,
+        audio_feature_lengths: Optional[torch.Tensor] = None,
+        use_audio_in_video: bool = False,
+    ) -> tuple[torch.Tensor, int]:
+        """Get mrope input positions and delta value."""
+
+        video_grid_thw = [[1, h, w] for t, h, w in video_grid_thw for _ in range(t)]
+
+        image_token_id = hf_config.image_token_id
+        video_token_id = hf_config.video_token_id
+        vision_start_token_id = hf_config.vision_start_token_id
+        spatial_merge_size = hf_config.vision_config.spatial_merge_size
+
+        input_tokens_tensor = torch.tensor(input_tokens)
+        vision_start_indices = torch.argwhere(
+            input_tokens_tensor == vision_start_token_id
+        ).squeeze(1)
+        vision_tokens = input_tokens_tensor[vision_start_indices + 1]
+        image_nums = (vision_tokens == image_token_id).sum()
+        video_nums = (vision_tokens == video_token_id).sum()
+        llm_pos_ids_list: list = []
+
+        st = 0
+        remain_images, remain_videos = image_nums, video_nums
+
+        image_index, video_index = 0, 0
+        for _ in range(image_nums + video_nums):
+            if image_token_id in input_tokens and remain_images > 0:
+                ed_image = input_tokens.index(image_token_id, st)
+            else:
+                ed_image = len(input_tokens) + 1
+            if video_token_id in input_tokens and remain_videos > 0:
+                ed_video = input_tokens.index(video_token_id, st)
+            else:
+                ed_video = len(input_tokens) + 1
+            if ed_image < ed_video:
+                t, h, w = (
+                    image_grid_thw[image_index][0],
+                    image_grid_thw[image_index][1],
+                    image_grid_thw[image_index][2],
+                )
+                image_index += 1
+                remain_images -= 1
+                ed = ed_image
+            else:
+                t, h, w = (
+                    video_grid_thw[video_index][0],
+                    video_grid_thw[video_index][1],
+                    video_grid_thw[video_index][2],
+                )
+                video_index += 1
+                remain_videos -= 1
+                ed = ed_video
+
+            llm_grid_t, llm_grid_h, llm_grid_w = (
+                t,
+                h // spatial_merge_size,
+                w // spatial_merge_size,
+            )
+            text_len = ed - st
+
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+            )
+
+            t_index = (
+                torch.arange(llm_grid_t)
+                .view(-1, 1)
+                .expand(-1, llm_grid_h * llm_grid_w)
+                .flatten()
+            )
+            h_index = (
+                torch.arange(llm_grid_h)
+                .view(1, -1, 1)
+                .expand(llm_grid_t, -1, llm_grid_w)
+                .flatten()
+            )
+            w_index = (
+                torch.arange(llm_grid_w)
+                .view(1, 1, -1)
+                .expand(llm_grid_t, llm_grid_h, -1)
+                .flatten()
+            )
+            llm_pos_ids_list.append(
+                torch.stack([t_index, h_index, w_index]) + text_len + st_idx
+            )
+            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+
+        if st < len(input_tokens):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+            text_len = len(input_tokens) - st
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+            )
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
+        llm_positions = llm_positions[:, context_len:seq_len]
+        return llm_positions, mrope_position_delta
+
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 2a64f6865f12a..bd530be73c2ad 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -410,6 +410,14 @@ def _embedding_count_expression(embeddings: NestedTensors) -> str:
     return " + ".join(_embedding_count_expression(inner) for inner in embeddings)
 
 
+def split_list_into_ranges(lst: torch.Tensor, interval: int) -> list[list[int]]:
+    ranges: list[list[int]] = [[] for _ in range((max(lst) // interval) + 1)]
+    for num in lst:
+        index = num // interval
+        ranges[index].append(num)
+    return ranges
+
+
 def _merge_multimodal_embeddings(
     inputs_embeds: torch.Tensor,
     multimodal_embeddings: NestedTensors,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index a323835e575cc..ec824f6d6bf5e 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -875,30 +875,19 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             if mm_input.get("use_audio_in_video") is True:
                 use_audio_in_video = True
 
-        if supports_mrope(self.get_model()):
-            req_state.mrope_positions, req_state.mrope_position_delta = (
-                self.model.get_mrope_input_positions(
-                    req_state.prompt_token_ids,
-                    hf_config=self.model_config.hf_config,
-                    image_grid_thw=image_grid_thw,
-                    video_grid_thw=video_grid_thw,
-                    second_per_grid_ts=second_per_grid_ts,
-                    audio_feature_lengths=audio_feature_lengths,
-                    use_audio_in_video=use_audio_in_video,
-                )
-            )
-        else:
-            req_state.mrope_positions, req_state.mrope_position_delta = (
-                MRotaryEmbedding.get_input_positions_tensor(
-                    req_state.prompt_token_ids,
-                    hf_config=self.model_config.hf_config,
-                    image_grid_thw=image_grid_thw,
-                    video_grid_thw=video_grid_thw,
-                    second_per_grid_ts=second_per_grid_ts,
-                    audio_feature_lengths=audio_feature_lengths,
-                    use_audio_in_video=use_audio_in_video,
-                )
+        assert supports_mrope(self.get_model()), "M-RoPE support is not implemented."
+
+        req_state.mrope_positions, req_state.mrope_position_delta = (
+            self.model.get_mrope_input_positions(
+                req_state.prompt_token_ids,
+                hf_config=self.model_config.hf_config,
+                image_grid_thw=image_grid_thw,
+                video_grid_thw=video_grid_thw,
+                second_per_grid_ts=second_per_grid_ts,
+                audio_feature_lengths=audio_feature_lengths,
+                use_audio_in_video=use_audio_in_video,
             )
+        )
 
     def _extract_mm_kwargs(
         self,
@@ -2900,7 +2889,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 logger.info("Loading drafter model...")
                 self.drafter.load_model(self.model)
             if self.use_aux_hidden_state_outputs:
-                if not supports_eagle3(self.model):
+                if not supports_eagle3(self.get_model()):
                     raise RuntimeError(
                         "Model does not support EAGLE3 interface but "
                         "aux_hidden_state_outputs was requested"
@@ -2928,7 +2917,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         prepare_communication_buffer_for_model(self.model)
 
         self.is_multimodal_pruning_enabled = (
-            supports_multimodal_pruning(self.model)
+            supports_multimodal_pruning(self.get_model())
             and self.model_config.multimodal_config.is_multimodal_pruning_enabled()
         )
 

From 086609de64456bfcfa44e1f9236a940122156b02 Mon Sep 17 00:00:00 2001
From: ihb2032 <40718643+ihb2032@users.noreply.github.com>
Date: Sat, 11 Oct 2025 17:12:16 +0800
Subject: [PATCH 07/30] fix(nix): Allow local oneDNN path to fix vLLM CPU build
 failure (#26401)

Signed-off-by: lyd1992 <liuyudong@iscas.ac.cn>
Signed-off-by: ihb2032 <1355790728@qq.com>
---
 cmake/cpu_extension.cmake | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index a6e53588f4f0f..9bac5ea41c8d4 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -198,13 +198,24 @@ else()
 endif()
 
 if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
-    FetchContent_Declare(
-        oneDNN
-        GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-        GIT_TAG v3.9
-        GIT_PROGRESS TRUE
-        GIT_SHALLOW TRUE
-    )
+    set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")
+
+    if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
+        message(STATUS "Using oneDNN from specified source directory: ${FETCHCONTENT_SOURCE_DIR_ONEDNN}")
+        FetchContent_Declare(
+            oneDNN
+            SOURCE_DIR ${FETCHCONTENT_SOURCE_DIR_ONEDNN}
+        )
+    else()
+        message(STATUS "Downloading oneDNN from GitHub")
+        FetchContent_Declare(
+            oneDNN
+            GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
+            GIT_TAG v3.9
+            GIT_PROGRESS TRUE
+            GIT_SHALLOW TRUE
+        )
+    endif()
 
     if(USE_ACL)
         find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)
@@ -227,7 +238,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
     set(ONEDNN_ENABLE_ITT_TASKS "OFF")
     set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
     set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
-    set(ONEDNN_VERBOSE "ON")
+    set(ONEDNN_VERBOSE "OFF")
     set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
 
     FetchContent_MakeAvailable(oneDNN)

From d2a71530c159c361f991a1ed986e64209651cc92 Mon Sep 17 00:00:00 2001
From: Rahul Tuli <rtuli@redhat.com>
Date: Sat, 11 Oct 2025 15:44:41 +0530
Subject: [PATCH 08/30] Add EAGLE-3 Speculative Decoding Support for Qwen3 MoE
 (#26485)

Signed-off-by: Rahul Tuli <rtuli@redhat.com>
---
 vllm/model_executor/models/qwen3_moe.py | 37 ++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index 34b5af846493a..825272535a450 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -64,7 +64,7 @@ from vllm.model_executor.model_loader.weight_utils import (
 from vllm.model_executor.models.utils import sequence_parallel_chunk
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
+from .interfaces import MixtureOfExperts, SupportsEagle3, SupportsLoRA, SupportsPP
 from .utils import (
     AutoWeightsLoader,
     PPMissingLayer,
@@ -422,6 +422,8 @@ class Qwen3MoeModel(nn.Module):
         self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
             ["hidden_states", "residual"], config.hidden_size
         )
+        # Track layers for auxiliary hidden state outputs (EAGLE3)
+        self.aux_hidden_state_layers: tuple[int, ...] = ()
 
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
@@ -432,7 +434,9 @@ class Qwen3MoeModel(nn.Module):
         positions: torch.Tensor,
         intermediate_tensors: Optional[IntermediateTensors] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> Union[
+        torch.Tensor, IntermediateTensors, tuple[torch.Tensor, list[torch.Tensor]]
+    ]:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -443,13 +447,29 @@ class Qwen3MoeModel(nn.Module):
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
-        for layer in islice(self.layers, self.start_layer, self.end_layer):
+
+        aux_hidden_states = []
+        for layer_idx, layer in enumerate(
+            islice(self.layers, self.start_layer, self.end_layer),
+            start=self.start_layer,
+        ):
+            # Collect auxiliary hidden states if specified
+            if layer_idx in self.aux_hidden_state_layers:
+                aux_hidden_state = (
+                    hidden_states + residual if residual is not None else hidden_states
+                )
+                aux_hidden_states.append(aux_hidden_state)
             hidden_states, residual = layer(positions, hidden_states, residual)
+
         if not get_pp_group().is_last_rank:
             return IntermediateTensors(
                 {"hidden_states": hidden_states, "residual": residual}
             )
         hidden_states, _ = self.norm(hidden_states, residual)
+
+        # Return auxiliary hidden states if collected
+        if len(aux_hidden_states) > 0:
+            return hidden_states, aux_hidden_states
         return hidden_states
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
@@ -606,7 +626,9 @@ class Qwen3MoeModel(nn.Module):
         return loaded_params
 
 
-class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, MixtureOfExperts):
+class Qwen3MoeForCausalLM(
+    nn.Module, SupportsPP, SupportsLoRA, SupportsEagle3, MixtureOfExperts
+):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -702,6 +724,13 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, MixtureOfExperts)
                 moe.n_redundant_experts = self.num_redundant_experts
                 moe.experts.update_expert_map()
 
+    def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
+        self.model.aux_hidden_state_layers = layers
+
+    def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]:
+        num_layers = len(self.model.layers)
+        return (2, num_layers // 2, num_layers - 3)
+
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.model.get_input_embeddings(input_ids)
 

From f7ee69868a26a7f1ce3e3c0086b6bcbee266e204 Mon Sep 17 00:00:00 2001
From: muzian666 <94029822+muzian666@users.noreply.github.com>
Date: Sat, 11 Oct 2025 20:04:04 +0800
Subject: [PATCH 09/30] [CPU] fix the issue when the node is '-' cause json
 decode error. (#26562)

Signed-off-by: muzian666 <andylee_2001@163.com>
Co-authored-by: qingan.li <qingan.li@wizpresso.com>
---
 vllm/platforms/cpu.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 2f87664003dcd..49c953fd36ee0 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -4,6 +4,7 @@
 import json
 import os
 import platform
+import re
 import subprocess
 import sys
 from dataclasses import dataclass
@@ -336,6 +337,7 @@ class CpuPlatform(Platform):
         lscpu_output = subprocess.check_output(
             "lscpu -J -e=CPU,CORE,NODE", shell=True, text=True
         )
+        lscpu_output = re.sub(r'"node":\s*-\s*(,|\n)', r'"node": 0\1', lscpu_output)
         logical_cpu_list: list[LogicalCPUInfo] = json.loads(
             lscpu_output, object_hook=LogicalCPUInfo.json_decoder
         )["cpus"]

From d0bed837ac83aa33f6540d32cbcc75525328078e Mon Sep 17 00:00:00 2001
From: Chauncey <chaunceyjiang@gmail.com>
Date: Sat, 11 Oct 2025 20:04:49 +0800
Subject: [PATCH 10/30] [Refactor]Reduce duplicate code in serving_chat
 (#26627)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
---
 vllm/entrypoints/openai/serving_chat.py      | 44 ++++--------------
 vllm/entrypoints/openai/serving_engine.py    | 47 +++++++++++++++++++-
 vllm/entrypoints/openai/serving_responses.py | 14 +-----
 3 files changed, 56 insertions(+), 49 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 51c618e9d51d7..94c24ce9b307a 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -6,7 +6,7 @@ import json
 import time
 from collections.abc import AsyncGenerator, AsyncIterator
 from collections.abc import Sequence as GenericSequence
-from typing import Callable, Final, Optional, Union
+from typing import Final, Optional, Union
 
 import jinja2
 import partial_json_parser
@@ -56,14 +56,13 @@ from vllm.entrypoints.openai.protocol import (
 )
 from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
-from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
+from vllm.entrypoints.openai.tool_parsers import ToolParser
 from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolCall
 from vllm.entrypoints.utils import get_max_tokens
 from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
 from vllm.logprobs import Logprob
 from vllm.outputs import CompletionOutput, RequestOutput
-from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
 from vllm.transformers_utils.tokenizers import (
@@ -112,42 +111,15 @@ class OpenAIServingChat(OpenAIServing):
         self.trust_request_chat_template = trust_request_chat_template
         self.enable_log_outputs = enable_log_outputs
 
+        # set up reasoning parser
+        self.reasoning_parser = self._get_reasoning_parser(
+            reasoning_parser_name=reasoning_parser
+        )
         # set up tool use
         self.enable_auto_tools: bool = enable_auto_tools
-        if self.enable_auto_tools:
-            logger.info(
-                '"auto" tool choice has been enabled please note that while'
-                " the parallel_tool_calls client option is preset for "
-                "compatibility reasons, it will be ignored."
-            )
-
-        self.reasoning_parser: Optional[Callable[[AnyTokenizer], ReasoningParser]] = (
-            None
+        self.tool_parser = self._get_tool_parser(
+            tool_parser_name=tool_parser, enable_auto_tools=enable_auto_tools
         )
-        if reasoning_parser:
-            try:
-                self.reasoning_parser = ReasoningParserManager.get_reasoning_parser(
-                    reasoning_parser
-                )
-                assert self.reasoning_parser is not None
-            except Exception as e:
-                raise TypeError(f"{reasoning_parser=} has not been registered") from e
-        self.tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None
-        if self.enable_auto_tools:
-            try:
-                if tool_parser == "pythonic" and self.model_config.model.startswith(
-                    "meta-llama/Llama-3.2"
-                ):
-                    logger.warning(
-                        "Llama3.2 models may struggle to emit valid pythonic tool calls"
-                    )
-                self.tool_parser = ToolParserManager.get_tool_parser(tool_parser)
-            except Exception as e:
-                raise TypeError(
-                    "Error: --enable-auto-tool-choice requires "
-                    f"tool_parser:'{tool_parser}' which has not "
-                    "been registered"
-                ) from e
         self.exclude_tools_when_tool_choice_none = exclude_tools_when_tool_choice_none
 
         self.enable_prompt_tokens_details = enable_prompt_tokens_details
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index edb8ecc94382a..0d1a525c6d3da 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -63,7 +63,7 @@ from vllm.entrypoints.openai.protocol import (
     TranslationRequest,
 )
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
-from vllm.entrypoints.openai.tool_parsers import ToolParser
+from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
 from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer, RenderConfig
 from vllm.entrypoints.utils import _validate_truncation_size
 from vllm.inputs.data import PromptType
@@ -82,6 +82,7 @@ from vllm.multimodal import (  # noqa: F401 - Required to resolve Pydantic error
 )
 from vllm.outputs import CompletionOutput, PoolingRequestOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
+from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.tracing import (
     contains_trace_headers,
@@ -274,6 +275,50 @@ class OpenAIServing:
         self.model_config = self.models.model_config
         self.max_model_len = self.model_config.max_model_len
 
+    def _get_tool_parser(
+        self, tool_parser_name: Optional[str] = None, enable_auto_tools: bool = False
+    ) -> Optional[Callable[[AnyTokenizer], ToolParser]]:
+        """Get the tool parser based on the name."""
+        parser = None
+        if not enable_auto_tools or tool_parser_name is None:
+            return parser
+        logger.info(
+            '"auto" tool choice has been enabled please note that while'
+            " the parallel_tool_calls client option is preset for "
+            "compatibility reasons, it will be ignored."
+        )
+
+        try:
+            if tool_parser_name == "pythonic" and self.model_config.model.startswith(
+                "meta-llama/Llama-3.2"
+            ):
+                logger.warning(
+                    "Llama3.2 models may struggle to emit valid pythonic tool calls"
+                )
+            parser = ToolParserManager.get_tool_parser(tool_parser_name)
+        except Exception as e:
+            raise TypeError(
+                "Error: --enable-auto-tool-choice requires "
+                f"tool_parser:'{tool_parser_name}' which has not "
+                "been registered"
+            ) from e
+        return parser
+
+    def _get_reasoning_parser(
+        self,
+        reasoning_parser_name: str,
+    ) -> Optional[Callable[[AnyTokenizer], ReasoningParser]]:
+        """Get the reasoning parser based on the name."""
+        parser = None
+        if not reasoning_parser_name:
+            return None
+        try:
+            parser = ReasoningParserManager.get_reasoning_parser(reasoning_parser_name)
+            assert parser is not None
+        except Exception as e:
+            raise TypeError(f"{reasoning_parser_name=} has not been registered") from e
+        return parser
+
     async def reset_mm_cache(self) -> None:
         self.processor.clear_mm_cache()
         await self.engine_client.reset_mm_cache()
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 48c5222bccc95..60f8b78ed1757 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -96,7 +96,6 @@ from vllm.logger import init_logger
 from vllm.logprobs import Logprob as SampleLogprob
 from vllm.logprobs import SampleLogprobs
 from vllm.outputs import CompletionOutput
-from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import random_uuid
@@ -136,18 +135,9 @@ class OpenAIServingResponses(OpenAIServing):
         self.chat_template_content_format: Final = chat_template_content_format
         self.enable_log_outputs = enable_log_outputs
 
-        self.reasoning_parser: Optional[Callable[[AnyTokenizer], ReasoningParser]] = (
-            None
+        self.reasoning_parser = self._get_reasoning_parser(
+            reasoning_parser_name=reasoning_parser
         )
-        if reasoning_parser:
-            try:
-                self.reasoning_parser = ReasoningParserManager.get_reasoning_parser(
-                    reasoning_parser
-                )
-                assert self.reasoning_parser is not None
-            except Exception as e:
-                raise TypeError(f"{reasoning_parser=} has not been registered") from e
-
         self.enable_prompt_tokens_details = enable_prompt_tokens_details
         self.enable_force_include_usage = enable_force_include_usage
         self.default_sampling_params = self.model_config.get_diff_sampling_param()

From a25f2adee9d66bf16128f2a8f399c558fd181647 Mon Sep 17 00:00:00 2001
From: Angela Yi <yiangela7@gmail.com>
Date: Sat, 11 Oct 2025 05:44:43 -0700
Subject: [PATCH 11/30] [compile] Add
 patched_fused_scaled_matmul_reduce_scatter (#26604)

Signed-off-by: angelayi <yiangela7@gmail.com>
---
 tests/compile/test_async_tp.py        | 26 ++++++--
 vllm/compilation/collective_fusion.py |  4 +-
 vllm/distributed/parallel_state.py    | 95 +++++++++++++++++++++++++++
 3 files changed, 119 insertions(+), 6 deletions(-)

diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py
index 88ad4f81df505..d396d3940f67f 100644
--- a/tests/compile/test_async_tp.py
+++ b/tests/compile/test_async_tp.py
@@ -142,7 +142,7 @@ class TestScaledMMRSModel(_BaseScaledMMModel):
         return [torch.ops.vllm.reduce_scatter.default]
 
     def ops_in_model_after(self):
-        return [torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter.default]
+        return [torch.ops.vllm.patched_fused_scaled_matmul_reduce_scatter.default]
 
 
 class TestAGScaledMMModel(_BaseScaledMMModel):
@@ -195,7 +195,7 @@ class TestCutlassScaledMMRSModel(_BaseScaledMMModel):
         return [torch.ops.vllm.reduce_scatter.default]
 
     def ops_in_model_after(self):
-        return [torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter.default]
+        return [torch.ops.vllm.patched_fused_scaled_matmul_reduce_scatter.default]
 
 
 class TestAGCutlassScaledMMModel(_BaseScaledMMModel):
@@ -243,9 +243,15 @@ class TestAGCutlassScaledMMModel(_BaseScaledMMModel):
 @pytest.mark.parametrize("seq_len", [16])
 @pytest.mark.parametrize("hidden_size", [16])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("dynamic", [True, False])
 @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
 def test_async_tp_pass_replace(
-    test_model: str, batch_size: int, seq_len: int, hidden_size: int, dtype: torch.dtype
+    test_model: str,
+    batch_size: int,
+    seq_len: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    dynamic: bool,
 ):
     if (
         test_model
@@ -269,7 +275,15 @@ def test_async_tp_pass_replace(
         # torch.distributed and cuda
         torch.multiprocessing.spawn(
             fn,
-            args=(num_processes, test_model, batch_size, seq_len, hidden_size, dtype),
+            args=(
+                num_processes,
+                test_model,
+                batch_size,
+                seq_len,
+                hidden_size,
+                dtype,
+                dynamic,
+            ),
             nprocs=nprocs,
         )
 
@@ -284,6 +298,7 @@ def async_tp_pass_on_test_model(
     seq_len: int,
     hidden_size: int,
     dtype: torch.dtype,
+    dynamic: bool,
 ):
     current_platform.seed_everything(0)
 
@@ -331,6 +346,9 @@ def async_tp_pass_on_test_model(
         (batch_size * seq_len, hidden_size), dtype=dtype, requires_grad=False
     )
 
+    if dynamic:
+        torch._dynamo.mark_dynamic(hidden_states, 0)
+
     compiled_model = torch.compile(model, backend=backend)
     compiled_model(hidden_states)
 
diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py
index 970d390f32b45..988a1069cd9e7 100644
--- a/vllm/compilation/collective_fusion.py
+++ b/vllm/compilation/collective_fusion.py
@@ -172,7 +172,7 @@ class ScaledMMReduceScatterPattern(BasePattern):
             # Calculate output shape: input @ mat2 with scatter_dim reduced
             output_shape = [*input.shape[:-1], mat2.shape[1]]
             scatter_dim = 0
-            gemm_rs = torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter(
+            gemm_rs = torch.ops.vllm.patched_fused_scaled_matmul_reduce_scatter(
                 input,
                 mat2,
                 scale_a,
@@ -307,7 +307,7 @@ class CutlassScaledMMReduceScatterPattern(BasePattern):
             # Calculate output shape: input @ mat2 with scatter_dim reduced
             output_shape = [*input.shape[:-1], mat2.shape[1]]
             scatter_dim = 0
-            gemm_rs = torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter(
+            gemm_rs = torch.ops.vllm.patched_fused_scaled_matmul_reduce_scatter(
                 input,
                 mat2,
                 scale_a,
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index aee5507ade467..cb5a75c59f096 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -37,6 +37,8 @@ from unittest.mock import patch
 
 import torch
 import torch.distributed
+import torch.distributed._functional_collectives as funcol
+import torch.distributed._symmetric_memory
 from torch.distributed import Backend, ProcessGroup
 from typing_extensions import deprecated
 
@@ -159,6 +161,90 @@ def all_gather_fake(
     return torch.empty(new_shape, dtype=tensor.dtype, device=tensor.device)
 
 
+def patched_fused_scaled_matmul_reduce_scatter_fake(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    A_scale: torch.Tensor,
+    B_scale: torch.Tensor,
+    reduce_op: str,
+    orig_scatter_dim: int,
+    scatter_dim_after_maybe_reshape: int,
+    group_name: str,
+    output_shape: list[int],
+    bias: torch.Tensor | None = None,
+    result_scale: torch.Tensor | None = None,
+    out_dtype: torch.dtype | None = None,
+    use_fast_accum: bool = False,
+) -> torch.Tensor:
+    # Copied from
+    # https://github.com/pytorch/pytorch/blob/50c338c2da905062449e4d9ac807832d1b5cd90e/torch/distributed/_symmetric_memory/__init__.py#L1189
+    if A_scale.numel() > 1:
+        if A_scale.shape[:-1] != A.shape[:-1]:
+            raise ValueError(
+                "For row-wise scaling, the leading dims of A_scale "
+                "must match the leading dims of A "
+                f"(A shape: {A.shape}, A_scale shape: {A_scale.shape})"
+            )
+        A_scale = A_scale.flatten(0, -2).contiguous()
+    elif A_scale.numel() != 1:
+        raise ValueError(
+            "Invalid A_scale shape "
+            f"(A shape: {A.shape}, A_scale shape: {A_scale.shape})"
+        )
+
+    C = torch._scaled_mm(
+        A.flatten(0, -2).contiguous(),
+        B,
+        A_scale,
+        B_scale,
+        bias,
+        result_scale,
+        out_dtype,
+        use_fast_accum,
+    )
+    C = C.view(*output_shape[:-1], B.shape[1])
+    res = funcol.reduce_scatter_tensor(
+        C,
+        reduce_op,
+        orig_scatter_dim,  # need original scatter dim for 3D+ output tensor here
+        group_name,
+    )
+    res = funcol.wait_tensor(res)
+    return res
+
+
+def patched_fused_scaled_matmul_reduce_scatter(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    A_scale: torch.Tensor,
+    B_scale: torch.Tensor,
+    reduce_op: str,
+    orig_scatter_dim: int,
+    scatter_dim_after_maybe_reshape: int,
+    group_name: str,
+    output_shape: list[int],
+    bias: torch.Tensor | None = None,
+    result_scale: torch.Tensor | None = None,
+    out_dtype: torch.dtype | None = None,
+    use_fast_accum: bool = False,
+) -> torch.Tensor:
+    return torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter(
+        A,
+        B,
+        A_scale,
+        B_scale,
+        reduce_op,
+        orig_scatter_dim,
+        scatter_dim_after_maybe_reshape,
+        group_name,
+        output_shape,
+        bias,
+        result_scale,
+        out_dtype,
+        use_fast_accum,
+    )
+
+
 if supports_custom_op():
     direct_register_custom_op(
         op_name="all_reduce",
@@ -178,6 +264,15 @@ if supports_custom_op():
         fake_impl=all_gather_fake,
     )
 
+    # TODO: Remove this once the pytorch fix
+    # (https://github.com/pytorch/pytorch/pull/165086) gets released,
+    # in either 2.9.1 or 2.10
+    direct_register_custom_op(
+        op_name="patched_fused_scaled_matmul_reduce_scatter",
+        op_func=patched_fused_scaled_matmul_reduce_scatter,
+        fake_impl=patched_fused_scaled_matmul_reduce_scatter_fake,
+    )
+
 
 class GroupCoordinator:
     """

From 9d6cff3edeb2421699671881592fd7558946695e Mon Sep 17 00:00:00 2001
From: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com>
Date: Sat, 11 Oct 2025 20:58:33 +0800
Subject: [PATCH 12/30] [Bugfix][Qwen3VL] fix deepstack in qwen3vl (#26626)

Signed-off-by: liuye.hj <liuye.hj@alibaba-inc.com>
Signed-off-by: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com>
Co-authored-by: liuye.hj <liuye.hj@alibaba-inc.com>
---
 vllm/model_executor/models/qwen3_vl.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 1e6c3485c4d60..6a7d2eaeab3b8 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -1702,12 +1702,6 @@ class Qwen3VLForConditionalGeneration(
         )
 
         if deepstack_input_embeds is not None:
-            deepstack_input_embeds = (
-                torch.zeros_like(inputs_embeds)
-                .unsqueeze(0)
-                .repeat(self.deepstack_num_level, 1, 1)
-                .contiguous()
-            )
             self._set_deepstack_input_embeds(deepstack_input_embeds)
 
         return inputs_embeds

From f0a30a067bacb9f3aaec1cc7a7efe005d6b2ff30 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Sat, 11 Oct 2025 23:21:33 +0800
Subject: [PATCH 13/30] [Bugfix] Fix qwen-moe packed_modules_mapping (#26634)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/model_executor/models/interfaces.py |  2 +-
 vllm/model_executor/models/qwen2_moe.py  | 18 +++++++++++++-----
 vllm/model_executor/models/qwen3_moe.py  | 14 +++++++++-----
 3 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 38c9d5abb5877..68915d60ef480 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -325,7 +325,7 @@ class SupportsLoRA(Protocol):
     # are empty by default.
     embedding_modules: ClassVar[dict[str, str]] = {}
     embedding_padding_modules: ClassVar[list[str]] = []
-    packed_modules_mapping: ClassVar[dict[str, list[str]]] = {}
+    packed_modules_mapping: dict[str, list[str]] = {}
 
 
 # We can't use runtime_checkable with ClassVar for issubclass checks
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index c57299a2d390f..7251e7b2eea49 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -534,11 +534,7 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
             "q_proj",
             "k_proj",
             "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
+        ]
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -547,6 +543,18 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
+        # Only perform the following mapping when Qwen2MoeMLP exists
+        if (
+            getattr(config, "mlp_only_layers", [])
+            or config.shared_expert_intermediate_size > 0
+        ):
+            # Map to a plain list of sub-module names. A trailing comma after
+            # the list would wrap it in a 1-tuple, i.e. (["gate_proj", ...],).
+            self.packed_modules_mapping["gate_up_proj"] = [
+                "gate_proj",
+                "up_proj",
+            ]
+
         self.model = Qwen2MoeModel(
             vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
         )
diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index 825272535a450..0769378933d52 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -634,11 +634,7 @@ class Qwen3MoeForCausalLM(
             "q_proj",
             "k_proj",
             "v_proj",
-        ],
-        "gate_up_proj": [
-            "gate_proj",
-            "up_proj",
-        ],
+        ]
     }
 
     fall_back_to_pt_during_load = False
@@ -649,6 +645,14 @@ class Qwen3MoeForCausalLM(
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
+        # Only perform the following mapping when Qwen3MoeMLP exists
+        if getattr(config, "mlp_only_layers", []):
+            # Map to a plain list of sub-module names. A trailing comma after
+            # the list would wrap it in a 1-tuple, i.e. (["gate_proj", ...],).
+            self.packed_modules_mapping["gate_up_proj"] = [
+                "gate_proj",
+                "up_proj",
+            ]
         self.model = Qwen3MoeModel(
             vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
         )

From 5be7ca1b99c91751c515371298d2e9c3ff21941c Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Sun, 12 Oct 2025 01:45:32 +0800
Subject: [PATCH 14/30] [Benchmark] Support Infinity API (#26641)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/benchmarks/datasets.py                  |   2 +-
 vllm/benchmarks/lib/endpoint_request_func.py | 123 ++++++++++++++-----
 2 files changed, 96 insertions(+), 29 deletions(-)

diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 7ffc21905924c..8e71a7bfb1293 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -1584,7 +1584,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
 
         if dataset_class.IS_MULTIMODAL and not (
             args.backend in ("openai-chat", "openai-audio")
-            or "openai-embeddings-" in args.backend
+            or "embeddings-" in args.backend
         ):
             # multi-modal benchmark is only available on OpenAI Chat
             # endpoint-type.
diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py
index 34dce5edb0c74..28146ce6200d1 100644
--- a/vllm/benchmarks/lib/endpoint_request_func.py
+++ b/vllm/benchmarks/lib/endpoint_request_func.py
@@ -581,29 +581,6 @@ async def async_request_openai_embeddings_chat(
     )
 
 
-async def async_request_openai_embeddings_clip(
-    request_func_input: RequestFuncInput,
-    session: aiohttp.ClientSession,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    if request_func_input.multi_modal_content:
-        # Image input
-        request_func_input.prompt = ""
-
-    # max_model_len=77 is too short for most datasets,
-    # so by default we truncate the prompt to max_model_len
-    if request_func_input.extra_body is None:
-        request_func_input.extra_body = {}
-    if "truncate_prompt_tokens" not in request_func_input.extra_body:
-        request_func_input.extra_body["truncate_prompt_tokens"] = -1
-
-    return await async_request_openai_embeddings_chat(
-        request_func_input,
-        session,
-        pbar=pbar,
-    )
-
-
 def _try_extract_request_idx(request_func_input: RequestFuncInput):
     if request_func_input.request_id:
         match = re.search(r"(\d+)$", request_func_input.request_id)
@@ -616,11 +593,20 @@ def _try_extract_request_idx(request_func_input: RequestFuncInput):
     return None
 
 
-async def async_request_openai_embeddings_vlm2vec(
-    request_func_input: RequestFuncInput,
-    session: aiohttp.ClientSession,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
+def _preprocess_clip(request_func_input: RequestFuncInput):
+    if request_func_input.multi_modal_content:
+        # Image input
+        request_func_input.prompt = ""
+
+    # max_model_len=77 is too short for most datasets,
+    # so by default we truncate the prompt to max_model_len
+    if request_func_input.extra_body is None:
+        request_func_input.extra_body = {}
+    if "truncate_prompt_tokens" not in request_func_input.extra_body:
+        request_func_input.extra_body["truncate_prompt_tokens"] = -1
+
+
+def _preprocess_vlm2vec(request_func_input: RequestFuncInput):
     if request_func_input.multi_modal_content:
         request_idx = _try_extract_request_idx(request_func_input)
 
@@ -637,6 +623,28 @@ async def async_request_openai_embeddings_vlm2vec(
                 f"{request_func_input.prompt}"
             )
 
+
+async def async_request_openai_embeddings_clip(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    _preprocess_clip(request_func_input)
+
+    return await async_request_openai_embeddings_chat(
+        request_func_input,
+        session,
+        pbar=pbar,
+    )
+
+
+async def async_request_openai_embeddings_vlm2vec(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    _preprocess_vlm2vec(request_func_input)
+
     return await async_request_openai_embeddings_chat(
         request_func_input,
         session,
@@ -645,6 +653,61 @@ async def async_request_openai_embeddings_vlm2vec(
     )
 
 
+async def async_request_infinity_embeddings(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    _validate_api_url(api_url, "Infinity Embeddings API", "embeddings")
+
+    payload = {
+        "model": request_func_input.model_name
+        if request_func_input.model_name
+        else request_func_input.model,
+    }
+
+    if request_func_input.prompt:
+        payload["input"] = request_func_input.prompt
+    else:
+        mm_content = request_func_input.multi_modal_content
+        assert isinstance(mm_content, dict)
+
+        mm_type = mm_content["type"]
+        payload["input"] = mm_content[mm_type]["url"]
+        payload["modality"] = mm_type.split("_", 1)[0]
+
+    _update_payload_common(payload, request_func_input)
+
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+    }
+    _update_headers_common(headers, request_func_input)
+
+    return await _run_openai_embeddings(
+        session,
+        api_url,
+        payload=payload,
+        headers=headers,
+        pbar=pbar,
+    )
+
+
+async def async_request_infinity_embeddings_clip(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    _preprocess_clip(request_func_input)
+
+    return await async_request_infinity_embeddings(
+        request_func_input,
+        session,
+        pbar=pbar,
+    )
+
+
 # TODO: Add more request functions for different API protocols.
 ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = {
     "vllm": async_request_openai_completions,
@@ -655,6 +718,10 @@ ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = {
     "openai-embeddings-chat": async_request_openai_embeddings_chat,
     "openai-embeddings-clip": async_request_openai_embeddings_clip,
     "openai-embeddings-vlm2vec": async_request_openai_embeddings_vlm2vec,
+    # Infinity embedding server: https://github.com/michaelfeil/infinity
+    "infinity-embeddings": async_request_infinity_embeddings,
+    "infinity-embeddings-clip": async_request_infinity_embeddings_clip,
+    # (Infinity embedding server does not support vlm2vec)
 }
 
 OPENAI_COMPATIBLE_BACKENDS = [

From 0cd103e7cbf0315c69434870c4973ded2c5d99e5 Mon Sep 17 00:00:00 2001
From: Huamin Li <3ericli@gmail.com>
Date: Sat, 11 Oct 2025 13:50:57 -0700
Subject: [PATCH 15/30] =?UTF-8?q?CP:=20make=20correct=5Fattn=5Fout=20robus?=
 =?UTF-8?q?t=20to=204=E2=80=91D=20views=20and=20fix=20Triton=20arg=20bindi?=
 =?UTF-8?q?ng=20(#26509)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Huamin Li <3ericli@gmail.com>
---
 vllm/attention/ops/common.py | 52 +++++++++++++++++++++++++++++++-----
 1 file changed, 45 insertions(+), 7 deletions(-)

diff --git a/vllm/attention/ops/common.py b/vllm/attention/ops/common.py
index 097fbae68cda5..1234e1b2e46a8 100644
--- a/vllm/attention/ops/common.py
+++ b/vllm/attention/ops/common.py
@@ -117,14 +117,52 @@ def correct_attn_out(
     if ctx is None:
         ctx = CPTritonContext()
 
-    lse = torch.empty_like(lses[0])
+    # --- Normalize to 3D views ---
+    if out.ndim == 4 and out.shape[1] == 1:
+        out = out.squeeze(1)
+    assert out.ndim == 3, f"expected out [B,H,D] or [B,1,H,D], got {tuple(out.shape)}"
 
-    grid = (out.shape[0], out.shape[1], 1)
-    regular_args = (out, out, lses, lse, *out.stride(), *lses.stride(), cp_rank)
-    const_args = {
-        "HEAD_DIM": out.shape[-1],
-        "N_ROUNDED": lses.shape[0],
-    }
+    if lses.ndim == 4 and lses.shape[-1] == 1:
+        lses = lses.squeeze(-1)
+    if lses.ndim == 4 and lses.shape[1] == 1:
+        lses = lses.squeeze(1)
+    assert lses.ndim == 3, (
+        f"expected lses [N,B,H] (optionally with a 1-sized extra dim), "
+        f"got {tuple(lses.shape)}"
+    )
+
+    B, H, D = out.shape
+    N = lses.shape[0]
+
+    # Strides after we normalized shapes to 3-D views.  The kernel computes
+    # offsets for `vlse_ptr` using lses_stride_B/H, so the output buffer must
+    # have the same B/H stride layout as a slice of `lses`.
+    o_sB, o_sH, o_sD = out.stride()
+    l_sN, l_sB, l_sH = lses.stride()
+
+    # Allocate LSE with the same B/H strides as `lses` so writes land correctly
+    # even when `lses` is a non-contiguous view (e.g., 4-D to 3-D squeeze).
+    lse = torch.empty_strided(
+        (B, H), (l_sB, l_sH), device=lses.device, dtype=lses.dtype
+    )
+
+    # Kernel launch config
+    grid = (B, H, 1)
+
+    regular_args = (
+        out,
+        out,
+        lses,
+        lse,
+        o_sB,
+        o_sH,
+        o_sD,
+        l_sN,
+        l_sB,
+        l_sH,
+        cp_rank,
+    )
+    const_args = {"HEAD_DIM": D, "N_ROUNDED": N}
 
     ctx.call_kernel(_correct_attn_cp_out_kernel, grid, *regular_args, **const_args)
     return out, lse

From 01653a917b140535cb6768c995a4bd5ea3253120 Mon Sep 17 00:00:00 2001
From: Angela Yi <yiangela7@gmail.com>
Date: Sat, 11 Oct 2025 14:03:14 -0700
Subject: [PATCH 16/30] [compile] Fix inductor partition config (#26645)

Signed-off-by: angelayi <yiangela7@gmail.com>
---
 vllm/config/compilation.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index e65728ba7f4e1..4209f3a9731c1 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -709,9 +709,7 @@ class CompilationConfig:
             return self.level == CompilationLevel.PIECEWISE
 
         # Inductor partition case
-        return (
-            self.level > CompilationLevel.NO_COMPILATION and self.backend == "inductor"
-        )
+        return self.level > CompilationLevel.NO_COMPILATION and self.use_inductor
 
     def custom_op_log_check(self):
         """

From c5c8f5ea59f060f68b2257bb4d2066a264fc865c Mon Sep 17 00:00:00 2001
From: Haisheng Chen <60504847+HsChen-sys@users.noreply.github.com>
Date: Sat, 11 Oct 2025 19:40:47 -0700
Subject: [PATCH 17/30] [EPLB] Support ernie4.5-moe (#22100)

Signed-off-by: Haisheng Chen <langzs335@outlook.com>
Signed-off-by: Haisheng Chen <60504847+HsChen-sys@users.noreply.github.com>
Signed-off-by: Haisheng Chen <hac048@ucsd.edu>
Co-authored-by: Haisheng Chen <langzs335@outlook.com>
---
 vllm/model_executor/models/ernie45_moe.py | 139 ++++++++++++++++++++--
 1 file changed, 132 insertions(+), 7 deletions(-)

diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py
index 7516cb5abaf9a..f0360d55a2e57 100644
--- a/vllm/model_executor/models/ernie45_moe.py
+++ b/vllm/model_executor/models/ernie45_moe.py
@@ -33,8 +33,12 @@ from transformers import PretrainedConfig
 
 from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CacheConfig, VllmConfig
-from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
+from vllm.distributed import (
+    get_ep_group,
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
@@ -58,7 +62,7 @@ from vllm.model_executor.model_loader.weight_utils import (
 )
 from vllm.sequence import IntermediateTensors
 
-from .interfaces import SupportsLoRA, SupportsPP
+from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
 from .utils import (
     AutoWeightsLoader,
     PPMissingLayer,
@@ -118,12 +122,34 @@ class Ernie4_5_MoeMoE(nn.Module):
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        enable_eplb: bool = False,
     ):
         super().__init__()
 
         layer_idx = extract_layer_index(prefix)
         self.layer_idx = layer_idx
         self.tp_size = get_tensor_model_parallel_world_size()
+
+        self.moe_num_shared_experts = getattr(config, "moe_num_shared_experts", None)
+        self.ep_group = get_ep_group().device_group
+        self.ep_rank = self.ep_group.rank()
+        self.ep_size = self.ep_group.size()
+        self.n_routed_experts: int = config.moe_num_experts
+        self.n_shared_experts: int = self.moe_num_shared_experts
+
+        # Load balancing settings.
+        vllm_config = get_current_vllm_config()
+        parallel_config = vllm_config.parallel_config
+        self.enable_eplb = enable_eplb
+
+        self.n_redundant_experts = parallel_config.num_redundant_experts
+        self.n_logical_experts = self.n_routed_experts
+        self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
+        self.n_local_physical_experts = self.n_physical_experts // self.ep_size
+        self.physical_expert_start = self.ep_rank * self.n_local_physical_experts
+        self.physical_expert_end = (
+            self.physical_expert_start + self.n_local_physical_experts
+        )
         self.has_shared_experts = getattr(config, "moe_num_shared_experts", 0) > 0
 
         if self.tp_size > config.moe_num_experts:
@@ -171,6 +197,8 @@ class Ernie4_5_MoeMoE(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.experts",
             e_score_correction_bias=self.gate.e_score_correction_bias,
+            enable_eplb=self.enable_eplb,
+            num_redundant_experts=self.n_redundant_experts,
         )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -298,6 +326,7 @@ class Ernie4_5_MoeDecoderLayer(nn.Module):
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        enable_eplb: bool = False,
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
@@ -338,7 +367,10 @@ class Ernie4_5_MoeDecoderLayer(nn.Module):
             and layer_idx <= moe_layer_end_index
         ):
             self.mlp = Ernie4_5_MoeMoE(
-                config=config, quant_config=quant_config, prefix=f"{prefix}.mlp"
+                config=config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+                enable_eplb=enable_eplb,
             )
         else:
             self.mlp = Ernie4_5_MoeMLP(
@@ -393,6 +425,9 @@ class Ernie4_5_MoeModel(nn.Module):
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.config = config
+        parallel_config = vllm_config.parallel_config
+        enable_eplb = parallel_config.enable_eplb
+        self.num_redundant_experts = parallel_config.num_redundant_experts
 
         if get_pp_group().is_first_rank:
             self.embed_tokens = VocabParallelEmbedding(
@@ -411,6 +446,7 @@ class Ernie4_5_MoeModel(nn.Module):
                 cache_config=cache_config,
                 quant_config=quant_config,
                 prefix=prefix,
+                enable_eplb=enable_eplb,
             ),
             prefix=f"{prefix}.layers",
         )
@@ -465,6 +501,7 @@ class Ernie4_5_MoeModel(nn.Module):
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
             num_experts=self.config.moe_num_experts,
+            num_redundant_experts=self.num_redundant_experts,
         )
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
@@ -513,15 +550,22 @@ class Ernie4_5_MoeModel(nn.Module):
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
+                is_expert_weight = False
                 for mapping in expert_params_mapping:
                     param_name, weight_name, expert_id, shard_id = mapping
 
                     if weight_name not in name:
                         continue
 
-                    name = name.replace(weight_name, param_name)
+                    # Whatever happens below, this is an expert weight and
+                    # must not be loaded as a regular weight later
+                    is_expert_weight = True
+
+                    # Do not modify `name` since the loop may continue here
+                    # Instead, create a new variable
+                    name_mapped = name.replace(weight_name, param_name)
                     # Skip layers on other devices.
-                    if is_pp_missing_parameter(name, self):
+                    if is_pp_missing_parameter(name_mapped, self):
                         continue
 
                     # Skip loading extra bias for GPTQ models.
@@ -541,6 +585,12 @@ class Ernie4_5_MoeModel(nn.Module):
                     )
                     break
                 else:
+                    if is_expert_weight:
+                        # We've checked that this is an expert weight
+                        # However it's not mapped locally to this rank
+                        # So we simply skip it
+                        continue
+
                     # Skip loading extra bias for GPTQ models.
                     if (
                         name.endswith(".bias") or name.endswith("_bias")
@@ -563,7 +613,7 @@ class Ernie4_5_MoeModel(nn.Module):
         return loaded_params
 
 
-class Ernie4_5_MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
+class Ernie4_5_MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, MixtureOfExperts):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -605,6 +655,81 @@ class Ernie4_5_MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
             self.model.make_empty_intermediate_tensors
         )
 
+        self.expert_weights = []
+
+        # Set MoE hyperparameters
+        moe_layers_indices = [
+            i
+            for i in range(config.num_hidden_layers)
+            if (
+                i >= config.moe_layer_start_index
+                and i <= config.moe_layer_end_index
+                and (i + 1) % config.moe_layer_interval == 0
+            )
+        ]
+        self.num_moe_layers = len(moe_layers_indices)
+        self.num_expert_groups = 1
+
+        self.moe_layers: list[SharedFusedMoE] = []
+        example_moe = None
+        for layer in self.model.layers:
+            if isinstance(layer, PPMissingLayer):
+                continue
+
+            assert isinstance(layer, Ernie4_5_MoeDecoderLayer)
+            if isinstance(layer.mlp, Ernie4_5_MoeMoE):
+                example_moe = layer.mlp
+                self.moe_layers.append(layer.mlp.experts)
+
+        if example_moe is None:
+            logger.warning("No Ernie4_5_MoeMoE layer found in model.layers.")
+            self.num_logical_experts = 0
+            self.num_physical_experts = 0
+            self.num_local_physical_experts = 0
+            self.num_routed_experts = 0
+            self.num_shared_experts = 0
+            self.num_redundant_experts = 0
+        else:
+            self.num_logical_experts = example_moe.n_logical_experts
+            self.num_physical_experts = example_moe.n_physical_experts
+            self.num_local_physical_experts = example_moe.n_local_physical_experts
+            self.num_routed_experts = example_moe.n_routed_experts
+            self.num_shared_experts = example_moe.n_shared_experts
+            self.num_redundant_experts = example_moe.n_redundant_experts
+
+    def set_eplb_state(
+        self,
+        expert_load_view: torch.Tensor,
+        logical_to_physical_map: torch.Tensor,
+        logical_replica_count: torch.Tensor,
+    ) -> None:
+        for layer_idx, layer in enumerate(self.moe_layers):
+            # Register the expert weights.
+            self.expert_weights.append(layer.get_expert_weights())
+            layer.set_eplb_state(
+                moe_layer_idx=layer_idx,
+                expert_load_view=expert_load_view,
+                logical_to_physical_map=logical_to_physical_map,
+                logical_replica_count=logical_replica_count,
+            )
+
+    def update_physical_experts_metadata(
+        self,
+        num_physical_experts: int,
+        num_local_physical_experts: int,
+    ) -> None:
+        assert self.num_local_physical_experts == num_local_physical_experts
+        self.num_physical_experts = num_physical_experts
+        self.num_local_physical_experts = num_local_physical_experts
+        self.num_redundant_experts = num_physical_experts - self.num_logical_experts
+        for layer in self.model.layers:
+            moe = getattr(layer, "mlp", None)  # PP stub layers may lack `mlp`
+            if isinstance(moe, Ernie4_5_MoeMoE):
+                moe.n_local_physical_experts = num_local_physical_experts
+                moe.n_physical_experts = num_physical_experts
+                moe.n_redundant_experts = self.num_redundant_experts
+                moe.experts.update_expert_map()
+
+
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.model.get_input_embeddings(input_ids)
 

From 4ca204055ef094c933c015847be95fa87ed6443e Mon Sep 17 00:00:00 2001
From: "wang.yuqi" <noooop@126.com>
Date: Sun, 12 Oct 2025 14:04:44 +0800
Subject: [PATCH 18/30] Add @noooop to codeowner for pooling models (#26652)

Signed-off-by: wang.yuqi <noooop@126.com>
---
 .github/CODEOWNERS | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index dbcad3aa308f5..61ac9fefc59f4 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -121,3 +121,11 @@ mkdocs.yaml @hmellor
 
 # KVConnector installation files
 /requirements/kv_connectors.txt @NickLucche
+
+# Pooling models
+/examples/*/pooling/ @noooop
+/tests/models/*/pooling* @noooop
+/tests/entrypoints/pooling @noooop
+/vllm/config/pooler.py @noooop
+/vllm/pooling_params.py @noooop
+/vllm/model_executor/layers/pooler.py @noooop

From 82e64c7a204671423c2c7914cbabdf06bc89c0c8 Mon Sep 17 00:00:00 2001
From: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com>
Date: Sun, 12 Oct 2025 12:27:50 +0400
Subject: [PATCH 19/30] [PERF] [Qwen3-next] Speed up gated RMSNorm (#26207)

Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
Signed-off-by: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 tests/kernels/test_fla_layernorm_guard.py     | 388 ++++++++++++++++++
 .../layers/fla/ops/layernorm_guard.py         | 120 ++++--
 2 files changed, 475 insertions(+), 33 deletions(-)
 create mode 100644 tests/kernels/test_fla_layernorm_guard.py

diff --git a/tests/kernels/test_fla_layernorm_guard.py b/tests/kernels/test_fla_layernorm_guard.py
new file mode 100644
index 0000000000000..f944c6dcfa73b
--- /dev/null
+++ b/tests/kernels/test_fla_layernorm_guard.py
@@ -0,0 +1,388 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+from vllm.model_executor.layers.fla.ops.layernorm_guard import (
+    layer_norm_fwd,
+    layernorm_fn,
+    rms_norm_ref,
+)
+from vllm.platforms import current_platform
+
+
+def layer_norm_ref(
+    x,
+    weight,
+    bias,
+    z=None,
+    eps=1e-6,
+    group_size=None,
+    norm_before_gate=True,
+    is_rms_norm=False,
+):
+    """Reference implementation for both layer norm and RMS norm."""
+    if is_rms_norm:
+        # Use the imported rms_norm_ref for RMS norm cases
+        return rms_norm_ref(
+            x,
+            weight,
+            bias,
+            z=z,
+            eps=eps,
+            group_size=group_size,
+            norm_before_gate=norm_before_gate,
+            upcast=True,
+        )
+
+    # Layer norm implementation
+    dtype = x.dtype
+    x = x.float()
+    weight = weight.float()
+    bias = bias.float() if bias is not None else None
+    z = z.float() if z is not None else None
+
+    if z is not None and not norm_before_gate:
+        x = x * F.silu(z)
+
+    if group_size is None:
+        # Layer norm: subtract mean
+        mean = x.mean(dim=-1, keepdim=True)
+        var = ((x - mean).square()).mean(dim=-1, keepdim=True)
+        rstd = 1 / torch.sqrt(var + eps)
+        out = (x - mean) * rstd * weight
+        if bias is not None:
+            out = out + bias
+    else:
+        # Group norm
+        from einops import rearrange
+
+        x_group = rearrange(x, "... (g d) -> ... g d", d=group_size)
+        mean = x_group.mean(dim=-1, keepdim=True)
+        var = ((x_group - mean).square()).mean(dim=-1, keepdim=True)
+        rstd = 1 / torch.sqrt(var + eps)
+        x_group = (x_group - mean) * rstd
+        out = rearrange(x_group, "... g d -> ... (g d)") * weight
+        if bias is not None:
+            out = out + bias
+
+    if z is not None and norm_before_gate:
+        out *= F.silu(z)
+
+    return out.to(dtype)
+
+
+DTYPES = [torch.bfloat16, torch.float32]
+# Test various M sizes to ensure rows_per_block logic works correctly
+NUM_TOKENS = [
+    1,
+    7,
+    16,
+    63,
+    128,
+    256,
+    512,
+    1024,
+    2048,
+    4096,
+    5789,
+    8189,
+    8191,
+    16383,
+    32767,
+]
+HIDDEN_SIZES = [64, 128, 256, 1024]
+GROUP_SIZES = [None, 64, 128]  # None means full hidden size
+NORM_BEFORE_GATE = [True, False]
+IS_RMS_NORM = [True, False]
+SEEDS = [0, 42]
+
+
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("is_rms_norm", IS_RMS_NORM)
+@torch.inference_mode()
+def test_layer_norm_fwd_basic(
+    num_tokens: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    seed: int,
+    is_rms_norm: bool,
+) -> None:
+    """Test basic layer norm forward pass without z (gate) tensor."""
+    current_platform.seed_everything(seed)
+    device = torch.device("cuda:0")
+
+    # Create inputs
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
+    weight = torch.randn(hidden_size, dtype=dtype, device=device)
+    bias = None if is_rms_norm else torch.randn(hidden_size, dtype=dtype, device=device)
+    eps = 1e-6
+
+    # Run the triton kernel
+    out, mean, rstd = layer_norm_fwd(
+        x, weight, bias, eps, z=None, is_rms_norm=is_rms_norm
+    )
+
+    # Run reference implementation
+    ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=is_rms_norm)
+
+    # Check outputs
+    assert out.shape == x.shape
+    assert out.dtype == x.dtype
+    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
+
+    # Check mean and rstd shapes
+    if not is_rms_norm:
+        assert mean.shape == (num_tokens,)
+    assert rstd.shape == (num_tokens,)
+
+
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+@pytest.mark.parametrize("hidden_size", [128, 256, 1024])
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("norm_before_gate", NORM_BEFORE_GATE)
+@pytest.mark.parametrize("is_rms_norm", IS_RMS_NORM)
+@torch.inference_mode()
+def test_layer_norm_fwd_with_gate(
+    num_tokens: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    norm_before_gate: bool,
+    is_rms_norm: bool,
+) -> None:
+    """Test layer norm forward pass with z (gate) tensor."""
+    current_platform.seed_everything(42)
+    device = torch.device("cuda:0")
+
+    # Create inputs
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
+    z = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
+    weight = torch.randn(hidden_size, dtype=dtype, device=device)
+    bias = None if is_rms_norm else torch.randn(hidden_size, dtype=dtype, device=device)
+    eps = 1e-6
+
+    # Run the triton kernel
+    out, mean, rstd = layer_norm_fwd(
+        x,
+        weight,
+        bias,
+        eps,
+        z=z,
+        norm_before_gate=norm_before_gate,
+        is_rms_norm=is_rms_norm,
+    )
+
+    # Run reference implementation
+    ref_out = layer_norm_ref(
+        x,
+        weight,
+        bias,
+        z=z,
+        eps=eps,
+        norm_before_gate=norm_before_gate,
+        is_rms_norm=is_rms_norm,
+    )
+
+    # Check outputs
+    assert out.shape == x.shape
+    assert out.dtype == x.dtype
+    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
+
+
+@pytest.mark.parametrize("num_tokens", [128, 512])
+@pytest.mark.parametrize("hidden_size", [512, 1024])
+@pytest.mark.parametrize("group_size", [64, 128, 256])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("is_rms_norm", IS_RMS_NORM)
+@torch.inference_mode()
+def test_layer_norm_fwd_with_groups(
+    num_tokens: int,
+    hidden_size: int,
+    group_size: int,
+    dtype: torch.dtype,
+    is_rms_norm: bool,
+) -> None:
+    """Test layer norm forward pass with group normalization."""
+    if hidden_size % group_size != 0:
+        pytest.skip(
+            f"hidden_size {hidden_size} not divisible by group_size {group_size}"
+        )
+
+    current_platform.seed_everything(42)
+    device = torch.device("cuda:0")
+
+    # Create inputs
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
+    weight = torch.randn(hidden_size, dtype=dtype, device=device)
+    bias = None if is_rms_norm else torch.randn(hidden_size, dtype=dtype, device=device)
+    eps = 1e-6
+
+    ngroups = hidden_size // group_size
+
+    # Run the triton kernel
+    out, mean, rstd = layer_norm_fwd(
+        x, weight, bias, eps, z=None, group_size=group_size, is_rms_norm=is_rms_norm
+    )
+
+    # Run reference implementation
+    ref_out = layer_norm_ref(
+        x, weight, bias, z=None, eps=eps, group_size=group_size, is_rms_norm=is_rms_norm
+    )
+
+    # Check outputs
+    assert out.shape == x.shape
+    assert out.dtype == x.dtype
+    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
+
+    # Check mean and rstd shapes for groups
+    if not is_rms_norm:
+        assert mean.shape == (ngroups * num_tokens,)
+    assert rstd.shape == (ngroups * num_tokens,)
+
+
+@pytest.mark.parametrize("num_tokens", [7, 63, 128, 513, 1024, 2049])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@torch.inference_mode()
+def test_layer_norm_rows_per_block(
+    num_tokens: int,
+    dtype: torch.dtype,
+) -> None:
+    """Test that rows_per_block logic works correctly for various M sizes."""
+    current_platform.seed_everything(42)
+    device = torch.device("cuda:0")
+    hidden_size = 1024
+
+    # Create inputs
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
+    weight = torch.randn(hidden_size, dtype=dtype, device=device)
+    bias = torch.randn(hidden_size, dtype=dtype, device=device)
+    eps = 1e-6
+
+    # Run the triton kernel
+    out, mean, rstd = layer_norm_fwd(x, weight, bias, eps, z=None, is_rms_norm=False)
+
+    # Run reference implementation
+    ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=False)
+
+    # Check outputs
+    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
+
+
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@torch.inference_mode()
+def test_strided_input(dtype: torch.dtype) -> None:
+    """Test the kernel on a contiguous copy of a strided (sliced) input.
+    NOTE: the non-contiguous view itself is never passed to the kernel."""
+    current_platform.seed_everything(42)
+    device = torch.device("cuda:0")
+    num_tokens = 128
+    hidden_size = 1024
+
+    # Create a larger tensor and take a strided slice
+    x_large = torch.randn(num_tokens, hidden_size * 2, dtype=dtype, device=device)
+    x = x_large[:, :hidden_size]
+
+    # Copy the view into contiguous memory before calling the kernel
+    x_contiguous = x.contiguous()
+
+    weight = torch.randn(hidden_size, dtype=dtype, device=device)
+    bias = torch.randn(hidden_size, dtype=dtype, device=device)
+    eps = 1e-6
+
+    # Run the triton kernel with contiguous input
+    out, mean, rstd = layer_norm_fwd(
+        x_contiguous, weight, bias, eps, z=None, is_rms_norm=False
+    )
+
+    # Run reference implementation
+    ref_out = layer_norm_ref(
+        x_contiguous, weight, bias, z=None, eps=eps, is_rms_norm=False
+    )
+
+    # Check outputs
+    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
+
+
+@pytest.mark.parametrize("num_tokens", [1, 128, 2048])
+@pytest.mark.parametrize("hidden_size", [768, 4096])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@torch.inference_mode()
+def test_output_buffer_provided(
+    num_tokens: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+) -> None:
+    """Test that the kernel works when an output buffer is provided."""
+    current_platform.seed_everything(42)
+    device = torch.device("cuda:0")
+
+    # Create inputs
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
+    weight = torch.randn(hidden_size, dtype=dtype, device=device)
+    bias = torch.randn(hidden_size, dtype=dtype, device=device)
+    eps = 1e-6
+
+    # Pre-allocate output buffer
+    out_buffer = torch.empty_like(x)
+
+    # Run the triton kernel with provided output
+    out, mean, rstd = layer_norm_fwd(
+        x, weight, bias, eps, z=None, out=out_buffer, is_rms_norm=False
+    )
+
+    # Check that the provided buffer was used
+    assert out.data_ptr() == out_buffer.data_ptr()
+
+    # Run reference implementation
+    ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=False)
+
+    # Check outputs
+    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
+
+
+@pytest.mark.parametrize(
+    "shape",
+    [
+        (4, 16, 1024),  # 3D tensor
+        (2, 8, 512, 256),  # 4D tensor
+    ],
+)
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@torch.inference_mode()
+def test_multidimensional_input(
+    shape: tuple,
+    dtype: torch.dtype,
+) -> None:
+    """Test that the autograd function handles multidimensional inputs."""
+    current_platform.seed_everything(42)
+    device = torch.device("cuda:0")
+    hidden_size = shape[-1]
+
+    # Create inputs
+    x = torch.randn(*shape, dtype=dtype, device=device)
+    weight = torch.randn(hidden_size, dtype=dtype, device=device)
+    bias = torch.randn(hidden_size, dtype=dtype, device=device)
+    eps = 1e-6
+
+    # Run through autograd function
+    out = layernorm_fn(x, weight, bias, z=None, eps=eps)
+
+    # Run reference implementation
+    ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=False)
+
+    # Check outputs
+    assert out.shape == x.shape
+    torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)
+
+
+if __name__ == "__main__":
+    # Run a quick smoke test
+    test_layer_norm_fwd_basic(128, 1024, torch.float16, 42, False)
+    test_layer_norm_fwd_with_gate(128, 1024, torch.float16, True, False)
+    test_layer_norm_rows_per_block(513, torch.float16)
+    print("All smoke tests passed!")
diff --git a/vllm/model_executor/layers/fla/ops/layernorm_guard.py b/vllm/model_executor/layers/fla/ops/layernorm_guard.py
index 655cdb3f30eb1..6d039efe58767 100644
--- a/vllm/model_executor/layers/fla/ops/layernorm_guard.py
+++ b/vllm/model_executor/layers/fla/ops/layernorm_guard.py
@@ -13,6 +13,7 @@
 # This backward pass is faster for dimensions up to 8k, but after that it's much slower due to register spilling.
 # The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.
 
+from functools import lru_cache
 from typing import Optional
 
 import torch
@@ -21,6 +22,7 @@ import torch.nn.functional as F
 from einops import rearrange
 
 from vllm.triton_utils import tl, triton
+from vllm.utils import cdiv, next_power_of_2
 
 from .utils import input_guard
 
@@ -76,55 +78,103 @@ def layer_norm_fwd_kernel(
     stride_y_row,
     stride_z_row,
     M,  # number of rows in X
-    N,  # number of columns in X
+    N: tl.constexpr,  # number of columns in X
     eps,  # epsilon to avoid division by zero
     BLOCK_N: tl.constexpr,
+    ROWS_PER_BLOCK: tl.constexpr,
     HAS_BIAS: tl.constexpr,
     HAS_Z: tl.constexpr,
     NORM_BEFORE_GATE: tl.constexpr,
     IS_RMS_NORM: tl.constexpr,
 ):
-    # Map the program id to the row of X and Y it should compute.
-    row = tl.program_id(0)
+    # Map the program id to the starting row of X and Y it should compute.
+    row_start = tl.program_id(0) * ROWS_PER_BLOCK
     group = tl.program_id(1)
-    X += row * stride_x_row + group * N
-    Y += row * stride_y_row + group * N
-    if HAS_Z:
-        Z += row * stride_z_row + group * N
-    if not IS_RMS_NORM:
-        Mean += group * M
-    Rstd += group * M
-    W += group * N
-    if HAS_BIAS:
-        B += group * N
-    # Compute mean and variance
+
+    # Create 2D tile: [ROWS_PER_BLOCK, BLOCK_N]
+    rows = row_start + tl.arange(0, ROWS_PER_BLOCK)
     cols = tl.arange(0, BLOCK_N)
-    x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32)
+
+    # Compute offsets for 2D tile
+    row_offsets = rows[:, None] * stride_x_row
+    col_offsets = cols[None, :] + group * N
+
+    # Base pointers
+    X_base = X + row_offsets + col_offsets
+    Y_base = Y + rows[:, None] * stride_y_row + col_offsets
+
+    # Create mask for valid rows and columns
+    row_mask = rows[:, None] < M
+    col_mask = cols[None, :] < N
+    mask = row_mask & col_mask
+
+    # Load input data with 2D tile
+    x = tl.load(X_base, mask=mask, other=0.0).to(tl.float32)
+
     if HAS_Z and not NORM_BEFORE_GATE:
-        z = tl.load(Z + cols, mask=cols < N).to(tl.float32)
+        Z_base = Z + rows[:, None] * stride_z_row + col_offsets
+        z = tl.load(Z_base, mask=mask, other=0.0).to(tl.float32)
         x *= z * tl.sigmoid(z)
+
+    # Compute mean and variance per row (reduce along axis 1)
     if not IS_RMS_NORM:
-        mean = tl.sum(x, axis=0) / N
-        tl.store(Mean + row, mean)
-        xbar = tl.where(cols < N, x - mean, 0.0)
-        var = tl.sum(xbar * xbar, axis=0) / N
+        mean = tl.sum(x, axis=1) / N  # Shape: [ROWS_PER_BLOCK]
+        # Store mean for each row
+        mean_offsets = group * M + rows
+        mean_mask = rows < M
+        tl.store(Mean + mean_offsets, mean, mask=mean_mask)
+        # Broadcast mean back to 2D for subtraction
+        xbar = tl.where(mask, x - mean[:, None], 0.0)
+        var = tl.sum(xbar * xbar, axis=1) / N  # Shape: [ROWS_PER_BLOCK]
     else:
-        xbar = tl.where(cols < N, x, 0.0)
-        var = tl.sum(xbar * xbar, axis=0) / N
-    rstd = 1 / tl.sqrt(var + eps)
-    tl.store(Rstd + row, rstd)
-    # Normalize and apply linear transformation
-    mask = cols < N
-    w = tl.load(W + cols, mask=mask).to(tl.float32)
+        xbar = tl.where(mask, x, 0.0)
+        var = tl.sum(xbar * xbar, axis=1) / N  # Shape: [ROWS_PER_BLOCK]
+        mean = 0.0  # Placeholder for RMS norm
+
+    rstd = tl.rsqrt(var + eps)  # Shape: [ROWS_PER_BLOCK]
+
+    # Store rstd for each row
+    rstd_offsets = group * M + rows
+    rstd_mask = rows < M
+    tl.store(Rstd + rstd_offsets, rstd, mask=rstd_mask)
+
+    # Load weights and biases (broadcast across rows)
+    w_offsets = cols + group * N
+    w_mask = cols < N
+    w = tl.load(W + w_offsets, mask=w_mask, other=0.0).to(tl.float32)
+
     if HAS_BIAS:
-        b = tl.load(B + cols, mask=mask).to(tl.float32)
-    x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
-    y = x_hat * w + b if HAS_BIAS else x_hat * w
+        b = tl.load(B + w_offsets, mask=w_mask, other=0.0).to(tl.float32)
+
+    # Normalize and apply linear transformation
+    if not IS_RMS_NORM:
+        x_hat = (x - mean[:, None]) * rstd[:, None]
+    else:
+        x_hat = x * rstd[:, None]
+
+    y = x_hat * w[None, :] + b[None, :] if HAS_BIAS else x_hat * w[None, :]
+
     if HAS_Z and NORM_BEFORE_GATE:
-        z = tl.load(Z + cols, mask=mask).to(tl.float32)
+        Z_base = Z + rows[:, None] * stride_z_row + col_offsets
+        z = tl.load(Z_base, mask=mask, other=0.0).to(tl.float32)
         y *= z * tl.sigmoid(z)
+
     # Write output
-    tl.store(Y + cols, y, mask=mask)
+    tl.store(Y_base, y, mask=mask)
+
+
+@lru_cache
+def _get_sm_count(device: torch.device) -> int:
+    """Get and cache the SM count for a given device."""
+    props = torch.cuda.get_device_properties(device)
+    return props.multi_processor_count
+
+
+def calc_rows_per_block(M: int, device: torch.device) -> int:
+    sm_count = _get_sm_count(device)
+    rows_per_block = next_power_of_2(cdiv(M, 2 * sm_count))
+    rows_per_block = min(rows_per_block, 4)
+    return rows_per_block
 
 
 def layer_norm_fwd(
@@ -171,7 +221,10 @@ def layer_norm_fwd(
         raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
     # heuristics for number of warps
     num_warps = min(max(BLOCK_N // 256, 1), 8)
-    grid = (M, ngroups)
+    # Calculate rows per block based on SM count
+    rows_per_block = calc_rows_per_block(M, x.device)
+    # Update grid to use rows_per_block
+    grid = (cdiv(M, rows_per_block), ngroups)
     layer_norm_fwd_kernel[grid](
         x,
         out,
@@ -187,6 +240,7 @@ def layer_norm_fwd(
         group_size,
         eps,
         BLOCK_N=BLOCK_N,
+        ROWS_PER_BLOCK=rows_per_block,
         NORM_BEFORE_GATE=norm_before_gate,
         IS_RMS_NORM=is_rms_norm,
         num_warps=num_warps,

From 76852017ea9175e1301b34162fa952bf82f02e94 Mon Sep 17 00:00:00 2001
From: "wang.yuqi" <noooop@126.com>
Date: Sun, 12 Oct 2025 17:29:08 +0800
Subject: [PATCH 20/30] [MISC] Rename the torch profiler filename as
 instance_id+rank_id for merging the Profiler results of each Rank (#25867)

Signed-off-by: wang.yuqi <noooop@126.com>
---
 vllm/config/vllm.py          | 4 ++++
 vllm/v1/worker/gpu_worker.py | 3 ++-
 vllm/v1/worker/xpu_worker.py | 3 ++-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 833581035a318..e6cfcad3d6962 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -5,6 +5,7 @@ import copy
 import hashlib
 import json
 import os
+import time
 from contextlib import contextmanager
 from dataclasses import field, replace
 from functools import lru_cache
@@ -270,6 +271,9 @@ class VllmConfig:
     def __post_init__(self):
         """Verify configs are valid & consistent with each other."""
 
+        # To give each torch profile run a unique instance name.
+        self.instance_id = f"{time.time_ns()}"
+
         self.try_verify_and_update_config()
 
         if self.model_config is not None:
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 4f4da73fba6e6..119e474b1fca9 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -79,6 +79,7 @@ class Worker(WorkerBase):
         # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
         if envs.VLLM_TORCH_PROFILER_DIR:
             torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
+            worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
             logger.info(
                 "Profiling enabled. Traces will be saved to: %s",
                 torch_profiler_trace_dir,
@@ -101,7 +102,7 @@ class Worker(WorkerBase):
                 with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
                 with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
                 on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                    torch_profiler_trace_dir, use_gzip=True
+                    torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True
                 ),
             )
         else:
diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py
index a1e54628d9ed1..31fa3f3bd6acc 100644
--- a/vllm/v1/worker/xpu_worker.py
+++ b/vllm/v1/worker/xpu_worker.py
@@ -39,6 +39,7 @@ class XPUWorker(Worker):
         # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
         if envs.VLLM_TORCH_PROFILER_DIR:
             torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
+            worker_name = f"{vllm_config.instance_id}-rank-{self.rank}"
             logger.info(
                 "Profiling enabled. Traces will be saved to: %s",
                 torch_profiler_trace_dir,
@@ -61,7 +62,7 @@ class XPUWorker(Worker):
                 with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
                 with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
                 on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                    torch_profiler_trace_dir, use_gzip=True
+                    torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True
                 ),
             )
         else:

From 045b396d090f4a16fbba760bef86e9a24a7ba9ce Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Sun, 12 Oct 2025 17:42:42 +0800
Subject: [PATCH 21/30] [Bugfix][CI/Build] Fix failing Mteb CI (#26638)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
---
 tests/models/language/pooling_mteb_test/mteb_utils.py       | 2 +-
 tests/models/language/pooling_mteb_test/test_jina.py        | 5 +++++
 .../models/language/pooling_mteb_test/test_st_projector.py  | 1 +
 tests/models/utils.py                                       | 1 +
 vllm/model_executor/layers/layernorm.py                     | 6 +++++-
 5 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/tests/models/language/pooling_mteb_test/mteb_utils.py b/tests/models/language/pooling_mteb_test/mteb_utils.py
index d96dc90416855..65ad49fad3653 100644
--- a/tests/models/language/pooling_mteb_test/mteb_utils.py
+++ b/tests/models/language/pooling_mteb_test/mteb_utils.py
@@ -191,7 +191,7 @@ def mteb_test_embed_models(
     with vllm_runner(
         model_info.name,
         runner="pooling",
-        max_model_len=None,
+        max_model_len=model_info.max_model_len,
         **vllm_extra_kwargs,
     ) as vllm_model:
         model_config = vllm_model.llm.llm_engine.model_config
diff --git a/tests/models/language/pooling_mteb_test/test_jina.py b/tests/models/language/pooling_mteb_test/test_jina.py
index 0a712b2542f3c..dbdf82af33c72 100644
--- a/tests/models/language/pooling_mteb_test/test_jina.py
+++ b/tests/models/language/pooling_mteb_test/test_jina.py
@@ -25,6 +25,11 @@ EMBEDDING_MODELS = [
         mteb_score=0.824413164,
         architecture="XLMRobertaModel",
         is_matryoshka=True,
+        # The default max length of the model is 8194, which will crash
+        # CUDAGraph due to odd length for Gemm. We set it to 8192 to
+        # avoid this issue.
+        max_model_len=8192,
+        dtype="float32",
     )
 ]
 
diff --git a/tests/models/language/pooling_mteb_test/test_st_projector.py b/tests/models/language/pooling_mteb_test/test_st_projector.py
index 91b1ef828d0df..74fe4b9bcc03f 100644
--- a/tests/models/language/pooling_mteb_test/test_st_projector.py
+++ b/tests/models/language/pooling_mteb_test/test_st_projector.py
@@ -23,6 +23,7 @@ ST_PROJECTOR_MODELS = [
         architecture="Gemma3TextModel",
         mteb_score=0.7473819294684156,
         enable_test=True,
+        dtype="float32",
     ),
 ]
 
diff --git a/tests/models/utils.py b/tests/models/utils.py
index 84697ad68d441..3d6e6cb89d62a 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -369,6 +369,7 @@ class ModelInfo:
     name: str
     architecture: str = ""
     dtype: str = "auto"
+    max_model_len: Optional[int] = None
     hf_dtype: str = "float32"
     hf_overrides: Optional[dict[str, Any]] = None
     default_pooling_type: str = ""
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index 6a49ae42ca895..910f145b1f8c2 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -318,7 +318,11 @@ class GemmaRMSNorm(CustomOp):
         """PyTorch-native implementation equivalent to forward()."""
         orig_dtype = x.dtype
         if residual is not None:
-            x = x + residual.float() if orig_dtype == torch.float16 else x + residual
+            x = (
+                x.float() + residual.float()
+                if orig_dtype == torch.float16
+                else x + residual
+            )
             residual = x
 
         x = x.float()

From b91d8db873a5f4d639a5cb57288cd94ed1614bb0 Mon Sep 17 00:00:00 2001
From: Jaya Yuan <yuanyongjie.yyj@antgroup.com>
Date: Sun, 12 Oct 2025 17:58:38 +0800
Subject: [PATCH 22/30] [Bugfix][DCP] Set default CUDAGraphMode to PIECEWISE
 for DCP (#26574)

Signed-off-by: FENP <32334296+FENP@users.noreply.github.com>
---
 vllm/config/vllm.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index e6cfcad3d6962..9d156dd8d9de3 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -350,6 +350,15 @@ class VllmConfig:
                         or self.model_config.is_encoder_decoder
                     ):
                         self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+
+                    # Decode context parallel does not support full cudagraphs yet.
+                    if self.parallel_config.decode_context_parallel_size > 1:
+                        logger.warning(
+                            "Decode context parallel (DCP) is enabled, which is "
+                            "incompatible with full CUDA graphs. Set "
+                            "cudagraph_mode to PIECEWISE."
+                        )
+                        self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
                 else:
                     self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
 

From 9bb38130cb19eb084d39f269cbeae2952789fafd Mon Sep 17 00:00:00 2001
From: "Chendi.Xue" <chendi.xue@intel.com>
Date: Sun, 12 Oct 2025 06:39:05 -0500
Subject: [PATCH 23/30] [Bugfix] Fix GPU_ID issue in test script (#26442)

Signed-off-by: Chendi Xue <chendi.xue@intel.com>
---
 .../nixl_integration/run_accuracy_test.sh           | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
index 3b0f2d102c1ff..3bf722900df37 100755
--- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
@@ -101,6 +101,12 @@ run_tests_for_model() {
   for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do
     # Calculate GPU ID - we'll distribute across available GPUs
     GPU_ID=$((i % $(get_num_gpus)))
+    NEXT_GPU=${GPU_ID}
+    # If PREFILLER_TP_SIZE is more than 1
+    for (( j=1; j < PREFILLER_TP_SIZE; j++ )); do
+      NEXT_GPU=$(((GPU_ID + j) % $(get_num_gpus)))
+      GPU_ID="${GPU_ID},${NEXT_GPU}"
+    done
 
     # Calculate port number (base port + instance number)
     PORT=$((8100 + i))
@@ -136,7 +142,12 @@ run_tests_for_model() {
   # Start decode instances
   for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do
     # Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs
-    GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus)))
+    GPU_ID=$(((i + NEXT_GPU + 1) % $(get_num_gpus)))
+    # If DECODER_TP_SIZE is more than 1
+    for (( j=1; j < DECODER_TP_SIZE; j++ )); do
+      NEXT_GPU=$(((GPU_ID + j) % $(get_num_gpus)))
+      GPU_ID="${GPU_ID},${NEXT_GPU}"
+    done
     # Calculate port number (base port + instance number)
     PORT=$((8200 + i))
     # Calculate side channel port

From 8fcaaf6a165e661f63fc51be906bc05b0767332f Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Sun, 12 Oct 2025 17:51:31 +0100
Subject: [PATCH 24/30] Update `Optional[x]` -> `x | None` and `Union[x, y]` to
 `x | y` (#26633)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 benchmarks/backend_request_func.py            |  27 +-
 benchmarks/benchmark_prefix_caching.py        |   5 +-
 benchmarks/benchmark_prioritization.py        |   3 +-
 .../benchmark_serving_structured_output.py    |   7 +-
 benchmarks/benchmark_utils.py                 |  16 +-
 .../cutlass_benchmarks/sparse_benchmarks.py   |   3 +-
 .../cutlass_benchmarks/w8a8_benchmarks.py     |  11 +-
 .../fused_kernels/layernorm_rms_benchmarks.py |   9 +-
 .../kernels/bench_per_token_quant_fp8.py      |   2 +-
 .../kernels/benchmark_device_communicators.py |   6 +-
 benchmarks/kernels/benchmark_lora.py          |  21 +-
 benchmarks/kernels/benchmark_machete.py       |  31 +-
 .../kernels/benchmark_paged_attention.py      |   3 +-
 .../benchmark_per_token_group_quant.py        |   2 +-
 .../kernels/benchmark_reshape_and_cache.py    |   2 -
 .../benchmark_reshape_and_cache_flash.py      |   2 -
 benchmarks/kernels/benchmark_rmsnorm.py       |  11 +-
 benchmarks/kernels/benchmark_rope.py          |   3 +-
 .../benchmark_trtllm_decode_attention.py      |   5 +-
 .../benchmark_trtllm_prefill_attention.py     |   5 +-
 benchmarks/kernels/utils.py                   |   6 +-
 benchmarks/multi_turn/bench_dataset.py        |  22 +-
 .../benchmark_serving_multi_turn.py           |  10 +-
 .../multi_turn/convert_sharegpt_to_openai.py  |  18 +-
 .../vllm_cutlass_library_extension.py         |  15 +-
 csrc/quantization/machete/generate.py         |   5 +-
 docs/contributing/model/transcription.md      |  14 +-
 docs/design/logits_processors.md              |   6 +-
 docs/features/custom_logitsprocs.md           |   3 +-
 examples/offline_inference/audio_language.py  |  12 +-
 .../rogue_shared_storage_connector.py         |   4 +-
 .../logits_processor/custom.py                |   4 +-
 .../logits_processor/custom_req.py            |   6 +-
 .../logits_processor/custom_req_init.py       |   4 +-
 .../lora_with_quantization_inference.py       |   7 +-
 .../offline_inference/multilora_inference.py  |   6 +-
 .../prithvi_geospatial_mae.py                 |   3 +-
 examples/offline_inference/rlhf_utils.py      |   7 +-
 examples/offline_inference/vision_language.py |   6 +-
 .../vision_language_multi_image.py            |  12 +-
 .../vision_language_pooling.py                |  16 +-
 .../disagg_proxy_demo.py                      |  17 +-
 .../online_serving/kv_events_subscriber.py    |  12 +-
 .../multi_instance_data_parallel.py           |   3 +-
 .../pooling/cohere_rerank_client.py           |   4 +-
 ...ai_chat_embedding_client_for_multimodal.py |   4 +-
 .../structured_outputs/structured_outputs.py  |  10 +-
 pyproject.toml                                |   6 -
 tests/benchmarks/test_random_dataset.py       |   6 +-
 tests/ci_envs.py                              |   9 +-
 tests/compile/backend.py                      |   5 +-
 tests/compile/piecewise/test_toy_llama.py     |   6 +-
 tests/compile/test_basic_correctness.py       |   2 -
 tests/compile/test_full_graph.py              |   6 +-
 tests/compile/test_fusion_attn.py             |   5 +-
 tests/compile/test_wrapper.py                 |   7 +-
 tests/conftest.py                             | 148 ++--
 tests/detokenizer/test_stop_strings.py        |   6 +-
 tests/distributed/conftest.py                 |   7 +-
 tests/distributed/test_comm_ops.py            |   5 +-
 tests/distributed/test_context_parallel.py    |   6 +-
 tests/distributed/test_expert_parallel.py     |  20 +-
 tests/distributed/test_pipeline_parallel.py   |   8 +-
 tests/distributed/test_pp_cudagraph.py        |   8 +-
 tests/distributed/test_sequence_parallel.py   |  10 +-
 tests/engine/test_arg_utils.py                |  16 +-
 .../openai/test_async_tokenization.py         |   2 +-
 tests/entrypoints/openai/test_chat.py         |   3 +-
 .../test_completion_with_function_calling.py  |   3 +-
 .../entrypoints/openai/test_lora_resolvers.py |  11 +-
 tests/entrypoints/openai/test_serving_chat.py |   9 +-
 .../entrypoints/openai/tool_parsers/utils.py  |   9 +-
 .../openai/test_embedding_dimensions.py       |   6 +-
 .../test_api_server_process_manager.py        |   5 +-
 tests/entrypoints/test_chat_utils.py          |  14 +-
 tests/entrypoints/test_renderer.py            |   3 +-
 tests/evals/gsm8k/gsm8k_eval.py               |  13 +-
 .../attention/test_aiter_flash_attn.py        |  11 +-
 tests/kernels/attention/test_attention.py     |   7 +-
 .../attention/test_cascade_flash_attn.py      |   3 +-
 .../attention/test_cutlass_mla_decode.py      |   3 +-
 tests/kernels/attention/test_flash_attn.py    |  17 +-
 tests/kernels/attention/test_flashinfer.py    |  17 +-
 .../test_flashinfer_trtllm_attention.py       |  13 +-
 .../attention/test_merge_attn_states.py       |   3 +-
 .../test_triton_unified_attention.py          |  11 +-
 .../core/test_fused_quant_layernorm.py        |  31 +-
 tests/kernels/core/test_pos_encoding.py       |   4 +-
 tests/kernels/core/test_rotary_embedding.py   |   4 +-
 tests/kernels/mamba/test_causal_conv1d.py     |  21 +-
 .../moe/modular_kernel_tools/common.py        |  34 +-
 .../make_feature_matrix.py                    |   5 +-
 .../moe/modular_kernel_tools/mk_objects.py    |  23 +-
 .../modular_kernel_tools/parallel_utils.py    |  11 +-
 .../profile_modular_kernel.py                 |   3 +-
 tests/kernels/moe/parallel_utils.py           |  21 +-
 tests/kernels/moe/test_batched_moe.py         |   7 +-
 .../moe/test_count_expert_num_tokens.py       |   3 +-
 tests/kernels/moe/test_cutlass_moe.py         |  23 +-
 tests/kernels/moe/test_deepep_deepgemm_moe.py |  13 +-
 tests/kernels/moe/test_deepep_moe.py          |  39 +-
 .../moe/test_modular_kernel_combinations.py   |  10 +-
 tests/kernels/moe/test_moe.py                 |   8 +-
 .../kernels/moe/test_moe_align_block_size.py  |   4 +-
 .../kernels/moe/test_moe_permute_unpermute.py |   8 +-
 tests/kernels/moe/test_ocp_mx_moe.py          |  13 +-
 tests/kernels/moe/test_pplx_cutlass_moe.py    |   3 +-
 tests/kernels/moe/test_pplx_moe.py            |  64 +-
 tests/kernels/moe/utils.py                    |  89 ++-
 tests/kernels/quant_utils.py                  |  29 +-
 .../kernels/quantization/test_cutlass_w4a8.py |  21 +-
 tests/kernels/quantization/test_machete_mm.py |  41 +-
 .../quantization/test_triton_scaled_mm.py     |   3 +-
 tests/kernels/test_onednn.py                  |   6 +-
 tests/kernels/utils.py                        | 110 ++-
 tests/lora/test_layers.py                     |   7 +-
 tests/lora/test_llama_tp.py                   |   7 +-
 tests/lora/test_qwen2vl.py                    |   7 +-
 tests/lora/test_resolver.py                   |   3 +-
 tests/lora/test_utils.py                      |   4 +-
 tests/lora/utils.py                           |   5 +-
 .../tensorizer_loader/conftest.py             |   2 +-
 .../model_executor/test_enabled_custom_ops.py |   3 +-
 .../models/language/generation/test_common.py |   3 +-
 .../models/language/generation/test_hybrid.py |   2 +-
 .../language/generation_ppl_test/ppl_utils.py |   4 +-
 tests/models/language/pooling/embed_utils.py  |   3 +-
 .../models/language/pooling/test_embedding.py |   3 +-
 tests/models/language/pooling/test_gritlm.py  |   2 -
 .../language/pooling_mteb_test/mteb_utils.py  |   7 +-
 .../test_bge_reranker_v2_gemma.py             |   4 +-
 .../generation/test_granite_speech.py         |   7 +-
 .../generation/test_phi4_multimodal.py        |   5 +-
 .../multimodal/generation/test_phi4mm.py      |   7 +-
 .../multimodal/generation/test_pixtral.py     |   4 +-
 .../multimodal/generation/test_qwen2_vl.py    |   6 +-
 .../multimodal/generation/test_whisper.py     |   3 +-
 .../generation/vlm_utils/builders.py          |  17 +-
 .../multimodal/generation/vlm_utils/core.py   |  21 +-
 .../generation/vlm_utils/custom_inputs.py     |   2 +-
 .../generation/vlm_utils/model_utils.py       |  21 +-
 .../multimodal/generation/vlm_utils/types.py  |  60 +-
 .../multimodal/pooling/test_dse_qwen2_vl.py   |   2 +-
 .../pooling/test_jinavl_reranker.py           |   5 +-
 .../multimodal/processing/test_common.py      |   5 +-
 .../multimodal/processing/test_h2ovl.py       |   3 +-
 .../multimodal/processing/test_internvl.py    |   3 +-
 .../multimodal/processing/test_nemotron_vl.py |   3 +-
 .../processing/test_tensor_schema.py          |  16 +-
 tests/models/quantization/test_awq.py         |   3 +-
 tests/models/registry.py                      |  20 +-
 tests/models/test_transformers.py             |   8 +-
 tests/models/utils.py                         |  38 +-
 tests/multimodal/test_cache.py                |   7 +-
 tests/multimodal/test_processing.py           |   4 +-
 .../prithvi_io_processor/prithvi_processor.py |  10 +-
 .../prithvi_io_processor/types.py             |   6 +-
 .../my_gemma_embedding.py                     |   7 +-
 .../vllm_add_dummy_model/my_llava.py          |   3 +-
 .../vllm_add_dummy_model/my_opt.py            |   3 +-
 .../vllm_add_dummy_platform/__init__.py       |   4 +-
 tests/quantization/test_blackwell_moe.py      |   3 +-
 tests/quantization/test_compressed_tensors.py |   4 +-
 tests/quantization/test_quark.py              |   5 +-
 .../test_register_quantization_config.py      |  16 +-
 tests/reasoning/utils.py                      |  17 +-
 tests/samplers/test_no_bad_words.py           |   8 +-
 tests/tokenization/test_detokenize.py         |   4 +-
 tests/tokenization/test_tokenizer_registry.py |  18 +-
 tests/tool_use/mistral/utils.py               |   7 +-
 tests/tool_use/test_jamba_tool_parser.py      |   3 +-
 tests/tool_use/test_parallel_tool_calls.py    |   3 +-
 tests/tool_use/test_qwen3coder_tool_parser.py |   3 +-
 tests/tool_use/test_seed_oss_tool_parser.py   |   3 +-
 tests/tool_use/test_tool_calls.py             |   7 +-
 tests/tool_use/test_xlam_tool_parser.py       |   3 +-
 tests/tool_use/utils.py                       |  10 +-
 .../test_config_parser_registry.py            |   7 +-
 tests/utils.py                                |  47 +-
 tests/v1/attention/test_attention_backends.py |   5 +-
 tests/v1/attention/test_mla_backends.py       |   6 +-
 tests/v1/attention/utils.py                   |   7 +-
 tests/v1/core/test_kv_cache_utils.py          |   8 +-
 tests/v1/core/test_prefix_caching.py          |  10 +-
 tests/v1/core/test_scheduler.py               |  23 +-
 tests/v1/core/utils.py                        |  17 +-
 tests/v1/distributed/test_async_llm_dp.py     |  11 +-
 tests/v1/distributed/test_internal_lb_dp.py   |   6 +-
 tests/v1/e2e/test_min_tokens.py               |   8 +-
 tests/v1/e2e/test_spec_decode.py              |   6 +-
 tests/v1/engine/test_async_llm.py             |   9 +-
 tests/v1/engine/test_engine_core_client.py    |  12 +-
 tests/v1/engine/test_llm_engine.py            |   4 +-
 tests/v1/engine/test_output_processor.py      |  17 +-
 tests/v1/engine/utils.py                      |  15 +-
 .../llm/test_struct_output_generate.py        |   4 +-
 .../v1/entrypoints/openai/test_completion.py  |   3 +-
 tests/v1/executor/test_executor.py            |  11 +-
 .../unit/test_kv_load_failure_recovery.py     |   2 +-
 .../kv_connector/unit/test_nixl_connector.py  |   5 +-
 .../unit/test_output_aggreagator.py           |   7 +-
 tests/v1/kv_connector/unit/utils.py           |  13 +-
 tests/v1/kv_offload/test_cpu_manager.py       |   3 +-
 .../v1/logits_processors/test_correctness.py  |   8 +-
 .../logits_processors/test_custom_offline.py  |   4 +-
 .../logits_processors/test_custom_online.py   |   6 +-
 tests/v1/logits_processors/utils.py           |   8 +-
 tests/v1/sample/test_rejection_sampler.py     |  28 +-
 tests/v1/sample/utils.py                      |   8 +-
 tests/v1/spec_decode/test_eagle.py            |   3 +-
 tests/v1/spec_decode/test_tree_attention.py   |   3 +-
 tests/v1/test_serial_utils.py                 |   3 +-
 tests/v1/tpu/test_basic.py                    |   4 +-
 tests/v1/tpu/test_perf.py                     |   4 +-
 tests/v1/tracing/test_tracing.py              |   2 -
 tests/v1/worker/test_gpu_input_batch.py       |   3 +-
 .../v1/worker/test_worker_memory_snapshot.py  |   5 +-
 .../vllm_test_utils/vllm_test_utils/blame.py  |   3 +-
 .../vllm_test_utils/monitor.py                |   4 +-
 tools/check_init_lazy_imports.py              |   2 -
 tools/enforce_regex_import.py                 |   2 -
 tools/pre_commit/mypy.py                      |   5 +-
 tools/profiler/visualize_layerwise_profile.py |   4 +-
 vllm/_bc_linter.py                            |   5 +-
 vllm/_custom_ops.py                           | 208 +++---
 vllm/_ipex_ops.py                             |  39 +-
 vllm/assets/base.py                           |   3 +-
 vllm/assets/video.py                          |   4 +-
 vllm/attention/backends/abstract.py           |  40 +-
 vllm/attention/backends/registry.py           |   5 +-
 vllm/attention/backends/utils.py              |   3 +-
 vllm/attention/layer.py                       |  53 +-
 .../layers/chunked_local_attention.py         |  12 +-
 vllm/attention/layers/cross_attention.py      |   5 +-
 .../layers/encoder_only_attention.py          |   5 +-
 vllm/attention/ops/flashmla.py                |  19 +-
 vllm/attention/ops/merge_attn_states.py       |   3 +-
 vllm/attention/ops/paged_attn.py              |  11 +-
 vllm/attention/ops/rocm_aiter_mla.py          |  19 +-
 vllm/attention/ops/rocm_aiter_paged_attn.py   |   3 +-
 .../attention/ops/triton_merge_attn_states.py |   3 +-
 vllm/attention/selector.py                    |  19 +-
 vllm/attention/utils/fa_utils.py              |   3 +-
 vllm/beam_search.py                           |  16 +-
 vllm/benchmarks/datasets.py                   |  66 +-
 vllm/benchmarks/latency.py                    |   4 +-
 vllm/benchmarks/lib/endpoint_request_func.py  |  40 +-
 vllm/benchmarks/serve.py                      |  30 +-
 vllm/benchmarks/throughput.py                 |  14 +-
 vllm/compilation/backends.py                  |  17 +-
 vllm/compilation/base_static_graph.py         |   3 +-
 vllm/compilation/collective_fusion.py         |  19 +-
 vllm/compilation/compiler_interface.py        |  33 +-
 vllm/compilation/cuda_graph.py                |  11 +-
 vllm/compilation/decorators.py                |  23 +-
 vllm/compilation/fix_functionalization.py     |  11 +-
 vllm/compilation/fx_utils.py                  |   9 +-
 vllm/compilation/inductor_pass.py             |  15 +-
 vllm/compilation/noop_elimination.py          |   7 +-
 vllm/compilation/partition_rules.py           |   6 +-
 vllm/compilation/piecewise_backend.py         |   3 +-
 vllm/compilation/sequence_parallelism.py      |   7 +-
 vllm/compilation/torch25_custom_graph_pass.py |   4 +-
 vllm/compilation/vllm_inductor_pass.py        |   4 +-
 vllm/compilation/wrapper.py                   |   4 +-
 vllm/config/cache.py                          |  20 +-
 vllm/config/compilation.py                    |  19 +-
 vllm/config/device.py                         |   4 +-
 vllm/config/kv_events.py                      |   3 +-
 vllm/config/kv_transfer.py                    |  14 +-
 vllm/config/load.py                           |  22 +-
 vllm/config/lora.py                           |   8 +-
 vllm/config/model.py                          | 134 ++--
 vllm/config/multimodal.py                     |  28 +-
 vllm/config/observability.py                  |   8 +-
 vllm/config/parallel.py                       |  26 +-
 vllm/config/pooler.py                         |  22 +-
 vllm/config/scheduler.py                      |   4 +-
 vllm/config/speculative.py                    |  30 +-
 vllm/config/speech_to_text.py                 |   3 +-
 vllm/config/vllm.py                           |  32 +-
 vllm/connections.py                           |  29 +-
 vllm/device_allocator/cumem.py                |  13 +-
 vllm/distributed/communication_op.py          |   6 +-
 .../device_communicators/all2all.py           |   4 +-
 .../device_communicators/all_reduce_utils.py  |   8 +-
 .../base_device_communicator.py               |  23 +-
 .../device_communicators/cpu_communicator.py  |  24 +-
 .../device_communicators/cuda_communicator.py |  25 +-
 .../device_communicators/cuda_wrapper.py      |   6 +-
 .../device_communicators/custom_all_reduce.py |  13 +-
 .../device_communicators/pynccl.py            |   7 +-
 .../device_communicators/pynccl_allocator.py  |   4 +-
 .../device_communicators/pynccl_wrapper.py    |   4 +-
 .../device_communicators/quick_all_reduce.py  |   5 +-
 .../device_communicators/ray_communicator.py  |  12 +-
 .../device_communicators/shm_broadcast.py     |  30 +-
 .../shm_object_storage.py                     |  18 +-
 .../device_communicators/symm_mem.py          |  11 +-
 .../device_communicators/tpu_communicator.py  |   5 +-
 .../device_communicators/xpu_communicator.py  |   7 +-
 vllm/distributed/eplb/eplb_state.py           |  15 +-
 vllm/distributed/eplb/rebalance_execute.py    |   3 +-
 vllm/distributed/kv_events.py                 |  29 +-
 .../kv_transfer/kv_connector/factory.py       |   3 +-
 .../kv_transfer/kv_connector/utils.py         |  10 +-
 .../kv_transfer/kv_connector/v1/base.py       |  18 +-
 .../kv_connector/v1/lmcache_connector.py      |   8 +-
 .../kv_transfer/kv_connector/v1/metrics.py    |   6 +-
 .../kv_connector/v1/multi_connector.py        |  22 +-
 .../kv_connector/v1/nixl_connector.py         |  32 +-
 .../kv_connector/v1/offloading_connector.py   |  12 +-
 .../kv_connector/v1/p2p/p2p_nccl_connector.py |   6 +-
 .../kv_connector/v1/p2p/p2p_nccl_engine.py    |  13 +-
 .../v1/shared_storage_connector.py            |   4 +-
 .../kv_transfer/kv_lookup_buffer/base.py      |   9 +-
 .../kv_lookup_buffer/mooncake_store.py        |   7 +-
 .../kv_lookup_buffer/simple_buffer.py         |  11 +-
 vllm/distributed/kv_transfer/kv_pipe/base.py  |   5 +-
 .../kv_transfer/kv_pipe/mooncake_pipe.py      |  13 +-
 .../kv_transfer/kv_pipe/pynccl_pipe.py        |  20 +-
 .../kv_transfer/kv_transfer_state.py          |   6 +-
 vllm/distributed/parallel_state.py            |  87 ++-
 vllm/distributed/tpu_distributed_utils.py     |   6 +-
 vllm/distributed/utils.py                     |   6 +-
 vllm/engine/arg_utils.py                      | 139 ++--
 vllm/engine/metrics.py                        |  26 +-
 vllm/engine/protocol.py                       |  32 +-
 vllm/entrypoints/api_server.py                |   6 +-
 vllm/entrypoints/chat_utils.py                | 211 +++---
 vllm/entrypoints/cli/benchmark/main.py        |   4 +-
 vllm/entrypoints/cli/collect_env.py           |   4 +-
 vllm/entrypoints/cli/main.py                  |   2 -
 vllm/entrypoints/cli/openai.py                |   4 +-
 vllm/entrypoints/cli/run_batch.py             |   4 +-
 vllm/entrypoints/cli/serve.py                 |   3 +-
 vllm/entrypoints/cli/types.py                 |   4 +-
 vllm/entrypoints/context.py                   |  16 +-
 vllm/entrypoints/harmony_utils.py             |  12 +-
 vllm/entrypoints/launcher.py                  |   4 +-
 vllm/entrypoints/llm.py                       | 234 +++---
 vllm/entrypoints/logger.py                    |  17 +-
 vllm/entrypoints/openai/api_server.py         |  40 +-
 vllm/entrypoints/openai/cli_args.py           |  32 +-
 vllm/entrypoints/openai/logits_processors.py  |   9 +-
 vllm/entrypoints/openai/protocol.py           | 678 +++++++++---------
 vllm/entrypoints/openai/run_batch.py          |   5 +-
 vllm/entrypoints/openai/serving_chat.py       |  44 +-
 .../openai/serving_classification.py          |  12 +-
 vllm/entrypoints/openai/serving_completion.py |  32 +-
 vllm/entrypoints/openai/serving_embedding.py  |  36 +-
 vllm/entrypoints/openai/serving_engine.py     | 188 +++--
 vllm/entrypoints/openai/serving_models.py     |  19 +-
 vllm/entrypoints/openai/serving_pooling.py    |  14 +-
 vllm/entrypoints/openai/serving_responses.py  |  72 +-
 vllm/entrypoints/openai/serving_score.py      |  58 +-
 .../openai/serving_tokenization.py            |  14 +-
 .../openai/serving_transcription.py           |   9 +-
 vllm/entrypoints/openai/speech_to_text.py     |  25 +-
 .../tool_parsers/abstract_tool_parser.py      |  13 +-
 .../tool_parsers/deepseekv31_tool_parser.py   |   5 +-
 .../tool_parsers/deepseekv3_tool_parser.py    |   5 +-
 .../tool_parsers/glm4_moe_tool_parser.py      |   6 +-
 .../granite_20b_fc_tool_parser.py             |   3 +-
 .../tool_parsers/granite_tool_parser.py       |   3 +-
 .../openai/tool_parsers/hermes_tool_parser.py |   5 +-
 .../tool_parsers/hunyuan_a13b_tool_parser.py  |   6 +-
 .../tool_parsers/internlm2_tool_parser.py     |   3 +-
 .../openai/tool_parsers/jamba_tool_parser.py  |   5 +-
 .../tool_parsers/kimi_k2_tool_parser.py       |   5 +-
 .../llama4_pythonic_tool_parser.py            |   8 +-
 .../openai/tool_parsers/llama_tool_parser.py  |   3 +-
 .../tool_parsers/minimax_tool_parser.py       |  14 +-
 .../tool_parsers/mistral_tool_parser.py       |   5 +-
 .../openai/tool_parsers/openai_tool_parser.py |   6 +-
 .../tool_parsers/phi4mini_tool_parser.py      |   4 +-
 .../tool_parsers/pythonic_tool_parser.py      |   8 +-
 .../tool_parsers/qwen3coder_tool_parser.py    |  12 +-
 .../tool_parsers/qwen3xml_tool_parser.py      |  18 +-
 .../tool_parsers/seed_oss_tool_parser.py      |   8 +-
 .../openai/tool_parsers/step3_tool_parser.py  |   6 +-
 vllm/entrypoints/renderer.py                  |  81 +--
 vllm/entrypoints/score_utils.py               |  34 +-
 vllm/entrypoints/ssl.py                       |   8 +-
 vllm/entrypoints/tool_server.py               |  12 +-
 vllm/entrypoints/utils.py                     |  12 +-
 vllm/envs.py                                  |  85 +--
 vllm/executor/executor_base.py                |  38 +-
 vllm/executor/ray_distributed_executor.py     |  21 +-
 vllm/executor/ray_utils.py                    |   6 +-
 vllm/executor/uniproc_executor.py             |  11 +-
 vllm/forward_context.py                       |  32 +-
 vllm/inputs/data.py                           |  50 +-
 vllm/inputs/parse.py                          |  20 +-
 vllm/inputs/preprocess.py                     |  86 ++-
 vllm/logger.py                                |   4 +-
 vllm/logging_utils/dump_input.py              |   5 +-
 vllm/logits_process.py                        |  12 +-
 vllm/logprobs.py                              |   7 +-
 vllm/lora/layers/base.py                      |  16 +-
 vllm/lora/layers/base_linear.py               |  13 +-
 vllm/lora/layers/column_parallel_linear.py    |  63 +-
 vllm/lora/layers/logits_processor.py          |  15 +-
 vllm/lora/layers/replicated_linear.py         |   5 +-
 vllm/lora/layers/row_parallel_linear.py       |  15 +-
 vllm/lora/layers/vocal_parallel_embedding.py  |  13 +-
 vllm/lora/lora_weights.py                     |  16 +-
 vllm/lora/models.py                           |  49 +-
 .../ops/triton_ops/lora_kernel_metadata.py    |   3 +-
 vllm/lora/peft_helper.py                      |  12 +-
 vllm/lora/punica_wrapper/punica_base.py       |  48 +-
 vllm/lora/punica_wrapper/punica_cpu.py        |  12 +-
 vllm/lora/punica_wrapper/punica_gpu.py        |  10 +-
 vllm/lora/punica_wrapper/punica_tpu.py        |  16 +-
 vllm/lora/punica_wrapper/punica_xpu.py        |  10 +-
 vllm/lora/punica_wrapper/utils.py             |   6 +-
 vllm/lora/request.py                          |   9 +-
 vllm/lora/resolver.py                         |   3 +-
 vllm/lora/utils.py                            |   8 +-
 vllm/lora/worker_manager.py                   |   6 +-
 vllm/model_executor/custom_op.py              |   3 +-
 vllm/model_executor/layers/activation.py      |   3 +-
 vllm/model_executor/layers/batch_invariant.py |  10 +-
 vllm/model_executor/layers/fla/ops/chunk.py   |   7 +-
 .../layers/fla/ops/chunk_delta_h.py           |   7 +-
 vllm/model_executor/layers/fla/ops/chunk_o.py |   7 +-
 .../layers/fla/ops/chunk_scaled_dot_kkt.py    |   5 +-
 vllm/model_executor/layers/fla/ops/cumsum.py  |  13 +-
 .../layers/fla/ops/fused_recurrent.py         |  19 +-
 vllm/model_executor/layers/fla/ops/l2norm.py  |   3 +-
 .../layers/fla/ops/layernorm_guard.py         |  13 +-
 .../layers/fla/ops/solve_tril.py              |   3 +-
 vllm/model_executor/layers/fla/ops/utils.py   |   5 +-
 vllm/model_executor/layers/fla/ops/wy_fast.py |   3 +-
 .../layers/fused_moe/__init__.py              |   6 +-
 .../layers/fused_moe/batched_deep_gemm_moe.py |  11 +-
 .../batched_triton_or_deep_gemm_moe.py        |  11 +-
 .../model_executor/layers/fused_moe/config.py | 128 ++--
 .../layers/fused_moe/cpu_fused_moe.py         |  42 +-
 .../layers/fused_moe/cutlass_moe.py           |  54 +-
 .../layers/fused_moe/deep_gemm_moe.py         |  19 +-
 .../layers/fused_moe/deep_gemm_utils.py       |  15 +-
 .../fused_moe/deepep_ht_prepare_finalize.py   |  28 +-
 .../fused_moe/deepep_ll_prepare_finalize.py   |  20 +-
 .../fused_moe/flashinfer_cutlass_moe.py       |  21 +-
 .../flashinfer_cutlass_prepare_finalize.py    |   9 +-
 .../layers/fused_moe/flashinfer_trtllm_moe.py |  13 +-
 .../layers/fused_moe/fused_batched_moe.py     |  42 +-
 .../layers/fused_moe/fused_marlin_moe.py      |  74 +-
 .../layers/fused_moe/fused_moe.py             | 169 ++---
 .../fused_moe/gpt_oss_triton_kernels_moe.py   |  21 +-
 vllm/model_executor/layers/fused_moe/layer.py | 216 +++---
 .../layers/fused_moe/modular_kernel.py        |  99 +--
 .../layers/fused_moe/moe_align_block_size.py  |   3 +-
 .../layers/fused_moe/moe_permute_unpermute.py |  27 +-
 .../layers/fused_moe/pplx_prepare_finalize.py |  26 +-
 .../layers/fused_moe/prepare_finalize.py      |   7 +-
 .../layers/fused_moe/rocm_aiter_fused_moe.py  |  51 +-
 .../layers/fused_moe/routing_simulator.py     |   8 +-
 .../layers/fused_moe/shared_fused_moe.py      |   5 +-
 .../fused_moe/topk_weight_and_reduce.py       |   9 +-
 .../layers/fused_moe/triton_deep_gemm_moe.py  |  11 +-
 .../layers/fused_moe/trtllm_moe.py            |  11 +-
 vllm/model_executor/layers/fused_moe/utils.py |  47 +-
 vllm/model_executor/layers/layernorm.py       |  34 +-
 vllm/model_executor/layers/lightning_attn.py  |   3 +-
 vllm/model_executor/layers/linear.py          |  50 +-
 .../model_executor/layers/logits_processor.py |  14 +-
 .../layers/mamba/linear_attn.py               |  14 +-
 .../layers/mamba/mamba_mixer.py               |  14 +-
 .../layers/mamba/mamba_mixer2.py              |  20 +-
 .../layers/mamba/mamba_utils.py               |  13 +-
 .../layers/mamba/ops/causal_conv1d.py         |  31 +-
 .../model_executor/layers/mamba/short_conv.py |   6 +-
 vllm/model_executor/layers/mla.py             |  21 +-
 vllm/model_executor/layers/pooler.py          |  54 +-
 .../layers/quantization/auto_round.py         |   6 +-
 .../model_executor/layers/quantization/awq.py |   8 +-
 .../layers/quantization/awq_marlin.py         |  29 +-
 .../layers/quantization/base_config.py        |   8 +-
 .../layers/quantization/bitblas.py            |  12 +-
 .../layers/quantization/bitsandbytes.py       |  33 +-
 .../compressed_tensors/compressed_tensors.py  |  26 +-
 .../compressed_tensors_moe.py                 | 126 ++--
 .../schemes/compressed_tensors_24.py          |  11 +-
 .../schemes/compressed_tensors_scheme.py      |   3 +-
 .../schemes/compressed_tensors_w4a16_24.py    |   6 +-
 .../schemes/compressed_tensors_w4a16_nvfp4.py |   4 +-
 .../schemes/compressed_tensors_w4a4_nvfp4.py  |   4 +-
 .../schemes/compressed_tensors_w4a8_fp8.py    |  10 +-
 .../schemes/compressed_tensors_w4a8_int.py    |   6 +-
 .../schemes/compressed_tensors_w8a16_fp8.py   |   4 +-
 .../schemes/compressed_tensors_w8a8_fp8.py    |   4 +-
 .../schemes/compressed_tensors_w8a8_int8.py   |   4 +-
 .../schemes/compressed_tensors_wNa16.py       |  10 +-
 .../compressed_tensors/transform/linear.py    |  15 +-
 .../compressed_tensors/transform/module.py    |   3 +-
 .../transform/schemes/linear_qutlass_nvfp4.py |   5 +-
 .../compressed_tensors/triton_scaled_mm.py    |   3 +-
 .../quantization/compressed_tensors/utils.py  |  11 +-
 .../layers/quantization/deepspeedfp.py        |   2 +-
 .../layers/quantization/experts_int8.py       |  23 +-
 .../layers/quantization/fbgemm_fp8.py         |   2 +-
 .../model_executor/layers/quantization/fp8.py |  37 +-
 .../layers/quantization/fp_quant.py           |  10 +-
 .../layers/quantization/gguf.py               |  31 +-
 .../layers/quantization/gptq.py               |  12 +-
 .../layers/quantization/gptq_bitblas.py       |   6 +-
 .../layers/quantization/gptq_marlin.py        |  33 +-
 .../layers/quantization/gptq_marlin_24.py     |   4 +-
 .../layers/quantization/hqq_marlin.py         |   4 +-
 .../layers/quantization/input_quant_fp8.py    |  13 +-
 .../layers/quantization/ipex_quant.py         |  35 +-
 .../kernels/mixed_precision/MPLinearKernel.py |  18 +-
 .../kernels/mixed_precision/__init__.py       |   4 +-
 .../kernels/mixed_precision/allspark.py       |   5 +-
 .../kernels/mixed_precision/bitblas.py        |  13 +-
 .../kernels/mixed_precision/conch.py          |   6 +-
 .../kernels/mixed_precision/cutlass.py        |   5 +-
 .../kernels/mixed_precision/dynamic_4bit.py   |   5 +-
 .../kernels/mixed_precision/exllama.py        |   5 +-
 .../kernels/mixed_precision/machete.py        |   5 +-
 .../kernels/mixed_precision/marlin.py         |   5 +-
 .../kernels/scaled_mm/ScaledMMLinearKernel.py |  11 +-
 .../kernels/scaled_mm/__init__.py             |   3 +-
 .../quantization/kernels/scaled_mm/aiter.py   |   9 +-
 .../quantization/kernels/scaled_mm/cpu.py     |   9 +-
 .../quantization/kernels/scaled_mm/cutlass.py |   5 +-
 .../quantization/kernels/scaled_mm/triton.py  |   5 +-
 .../quantization/kernels/scaled_mm/xla.py     |   9 +-
 .../layers/quantization/modelopt.py           |  65 +-
 .../layers/quantization/moe_wna16.py          |  27 +-
 .../layers/quantization/mxfp4.py              |  41 +-
 .../layers/quantization/petit.py              |  10 +-
 .../layers/quantization/ptpc_fp8.py           |   4 +-
 .../layers/quantization/quark/quark.py        |  22 +-
 .../layers/quantization/quark/quark_moe.py    |  43 +-
 .../quark/schemes/quark_ocp_mx.py             |  13 +-
 .../quark/schemes/quark_scheme.py             |   3 +-
 .../quark/schemes/quark_w8a8_fp8.py           |   9 +-
 .../quark/schemes/quark_w8a8_int8.py          |   8 +-
 .../layers/quantization/quark/utils.py        |   4 +-
 .../model_executor/layers/quantization/rtn.py |  25 +-
 .../layers/quantization/schema.py             |   4 +-
 .../layers/quantization/torchao.py            |   4 +-
 .../layers/quantization/tpu_int8.py           |   2 +-
 .../quantization/utils/bitblas_utils.py       |  13 +-
 .../quantization/utils/flashinfer_fp4_moe.py  |   2 -
 .../quantization/utils/flashinfer_utils.py    |  15 +-
 .../layers/quantization/utils/fp8_utils.py    |  40 +-
 .../layers/quantization/utils/gptq_utils.py   |  14 +-
 .../layers/quantization/utils/int8_utils.py   |   8 +-
 .../layers/quantization/utils/layer_utils.py  |   3 +-
 .../quantization/utils/machete_utils.py       |   3 +-
 .../layers/quantization/utils/marlin_utils.py |  19 +-
 .../quantization/utils/marlin_utils_fp4.py    |   5 +-
 .../quantization/utils/marlin_utils_fp8.py    |   3 +-
 .../quantization/utils/marlin_utils_test.py   |   4 +-
 .../layers/quantization/utils/mxfp4_utils.py  |  19 +-
 .../layers/quantization/utils/ocp_mx_utils.py |   5 +-
 .../layers/quantization/utils/petit_utils.py  |   8 +-
 .../layers/quantization/utils/quant_utils.py  |  12 +-
 .../layers/quantization/utils/w8a8_utils.py   |  18 +-
 vllm/model_executor/layers/resampler.py       |  20 +-
 .../layers/rotary_embedding/__init__.py       |   8 +-
 .../layers/rotary_embedding/base.py           |  18 +-
 .../layers/rotary_embedding/common.py         |   4 +-
 .../rotary_embedding/deepseek_scaling_rope.py |  13 +-
 .../rotary_embedding/dual_chunk_rope.py       |   5 +-
 .../rotary_embedding/ernie45_vl_rope.py       |   9 +-
 .../rotary_embedding/linear_scaling_rope.py   |   3 +-
 .../rotary_embedding/llama4_vision_rope.py    |   9 +-
 .../layers/rotary_embedding/mrope.py          |  39 +-
 .../rotary_embedding/ntk_scaling_rope.py      |   3 +-
 .../phi3_long_rope_scaled_rope.py             |  11 +-
 vllm/model_executor/layers/utils.py           |  12 +-
 .../layers/vocab_parallel_embedding.py        |  17 +-
 vllm/model_executor/model_loader/__init__.py  |   4 +-
 .../model_loader/bitsandbytes_loader.py       |  10 +-
 .../model_loader/default_loader.py            |  10 +-
 .../model_loader/runai_streamer_loader.py     |   3 +-
 .../model_loader/sharded_state_loader.py      |   8 +-
 .../model_executor/model_loader/tensorizer.py |  48 +-
 .../model_loader/tensorizer_loader.py         |   3 +-
 vllm/model_executor/model_loader/tpu.py       |   5 +-
 vllm/model_executor/model_loader/utils.py     |   7 +-
 .../model_loader/weight_utils.py              |  36 +-
 vllm/model_executor/models/adapters.py        |   8 +-
 vllm/model_executor/models/aimv2.py           |   5 +-
 vllm/model_executor/models/apertus.py         |  38 +-
 vllm/model_executor/models/arcee.py           |  28 +-
 vllm/model_executor/models/arctic.py          |  31 +-
 vllm/model_executor/models/aria.py            |  28 +-
 vllm/model_executor/models/aya_vision.py      |  20 +-
 vllm/model_executor/models/baichuan.py        |  27 +-
 vllm/model_executor/models/bailing_moe.py     |  33 +-
 vllm/model_executor/models/bamba.py           |  29 +-
 vllm/model_executor/models/bert.py            |  49 +-
 vllm/model_executor/models/bert_with_rope.py  |  37 +-
 vllm/model_executor/models/blip.py            |  21 +-
 vllm/model_executor/models/blip2.py           |  42 +-
 vllm/model_executor/models/bloom.py           |  25 +-
 vllm/model_executor/models/chameleon.py       |  48 +-
 vllm/model_executor/models/chatglm.py         |  31 +-
 vllm/model_executor/models/clip.py            |  88 +--
 vllm/model_executor/models/cohere2_vision.py  |  18 +-
 vllm/model_executor/models/commandr.py        |  33 +-
 vllm/model_executor/models/dbrx.py            |  37 +-
 vllm/model_executor/models/deepseek.py        |  32 +-
 vllm/model_executor/models/deepseek_eagle.py  |   5 +-
 vllm/model_executor/models/deepseek_mtp.py    |  13 +-
 vllm/model_executor/models/deepseek_v2.py     |  70 +-
 vllm/model_executor/models/deepseek_vl2.py    |  32 +-
 vllm/model_executor/models/dots1.py           |  32 +-
 vllm/model_executor/models/dots_ocr.py        |  44 +-
 vllm/model_executor/models/ernie45_moe.py     |  34 +-
 vllm/model_executor/models/ernie45_vl.py      |  68 +-
 vllm/model_executor/models/ernie45_vl_moe.py  |  38 +-
 vllm/model_executor/models/ernie_mtp.py       |   9 +-
 vllm/model_executor/models/exaone.py          |  38 +-
 vllm/model_executor/models/exaone4.py         |  32 +-
 vllm/model_executor/models/falcon.py          |  26 +-
 vllm/model_executor/models/falcon_h1.py       |  33 +-
 vllm/model_executor/models/flex_olmo.py       |   6 +-
 vllm/model_executor/models/fuyu.py            |  16 +-
 vllm/model_executor/models/gemma.py           |  35 +-
 vllm/model_executor/models/gemma2.py          |  31 +-
 vllm/model_executor/models/gemma3.py          |  31 +-
 vllm/model_executor/models/gemma3_mm.py       |  22 +-
 vllm/model_executor/models/gemma3n.py         |  47 +-
 vllm/model_executor/models/glm4.py            |  21 +-
 vllm/model_executor/models/glm4_1v.py         |  52 +-
 vllm/model_executor/models/glm4_moe.py        |  36 +-
 vllm/model_executor/models/glm4_moe_mtp.py    |  17 +-
 vllm/model_executor/models/glm4v.py           |  44 +-
 vllm/model_executor/models/gpt2.py            |  29 +-
 vllm/model_executor/models/gpt_bigcode.py     |  25 +-
 vllm/model_executor/models/gpt_j.py           |  25 +-
 vllm/model_executor/models/gpt_neox.py        |  25 +-
 vllm/model_executor/models/gpt_oss.py         |  15 +-
 vllm/model_executor/models/granite.py         |  30 +-
 vllm/model_executor/models/granite_speech.py  |  28 +-
 vllm/model_executor/models/granitemoe.py      |  26 +-
 .../model_executor/models/granitemoehybrid.py |  33 +-
 .../model_executor/models/granitemoeshared.py |  17 +-
 vllm/model_executor/models/gritlm.py          |  13 +-
 vllm/model_executor/models/grok1.py           |  31 +-
 vllm/model_executor/models/h2ovl.py           |  53 +-
 vllm/model_executor/models/hunyuan_v1.py      |  46 +-
 .../models/hyperclovax_vision.py              |  50 +-
 .../models/idefics2_vision_model.py           |  21 +-
 vllm/model_executor/models/idefics3.py        |  46 +-
 vllm/model_executor/models/interfaces.py      | 132 ++--
 vllm/model_executor/models/interfaces_base.py |  26 +-
 vllm/model_executor/models/intern_vit.py      |  25 +-
 vllm/model_executor/models/internlm2.py       |  34 +-
 vllm/model_executor/models/internlm2_ve.py    |  17 +-
 vllm/model_executor/models/interns1.py        |  42 +-
 vllm/model_executor/models/interns1_vit.py    |  21 +-
 vllm/model_executor/models/internvl.py        | 122 ++--
 vllm/model_executor/models/jais.py            |  25 +-
 vllm/model_executor/models/jamba.py           |  39 +-
 vllm/model_executor/models/jina_vl.py         |   9 +-
 vllm/model_executor/models/keye.py            | 176 ++---
 vllm/model_executor/models/keye_vl1_5.py      |  42 +-
 vllm/model_executor/models/kimi_vl.py         |  20 +-
 vllm/model_executor/models/lfm2.py            |  36 +-
 vllm/model_executor/models/lfm2_moe.py        |  36 +-
 vllm/model_executor/models/llama.py           |  38 +-
 vllm/model_executor/models/llama4.py          |  12 +-
 vllm/model_executor/models/llama4_eagle.py    |  11 +-
 vllm/model_executor/models/llama_eagle.py     |   5 +-
 vllm/model_executor/models/llama_eagle3.py    |  17 +-
 vllm/model_executor/models/llava.py           |  56 +-
 vllm/model_executor/models/llava_next.py      |  32 +-
 .../model_executor/models/llava_next_video.py |  22 +-
 vllm/model_executor/models/llava_onevision.py |  46 +-
 vllm/model_executor/models/longcat_flash.py   |  27 +-
 .../models/longcat_flash_mtp.py               |  17 +-
 vllm/model_executor/models/mamba.py           |  19 +-
 vllm/model_executor/models/mamba2.py          |  17 +-
 vllm/model_executor/models/midashenglm.py     |  52 +-
 vllm/model_executor/models/mimo.py            |   9 +-
 vllm/model_executor/models/mimo_mtp.py        |  13 +-
 vllm/model_executor/models/minicpm.py         |  42 +-
 vllm/model_executor/models/minicpm3.py        |  12 +-
 vllm/model_executor/models/minicpm_eagle.py   |  15 +-
 vllm/model_executor/models/minicpmo.py        |  30 +-
 vllm/model_executor/models/minicpmv.py        |  74 +-
 vllm/model_executor/models/minimax_text_01.py |  36 +-
 vllm/model_executor/models/minimax_vl_01.py   |  38 +-
 vllm/model_executor/models/mistral3.py        |  32 +-
 vllm/model_executor/models/mixtral.py         |  33 +-
 vllm/model_executor/models/mllama4.py         |  34 +-
 vllm/model_executor/models/modernbert.py      |  27 +-
 vllm/model_executor/models/module_mapping.py  |   9 +-
 vllm/model_executor/models/molmo.py           |  80 +--
 vllm/model_executor/models/moonvit.py         |  15 +-
 vllm/model_executor/models/mpt.py             |  25 +-
 .../model_executor/models/nano_nemotron_vl.py |  90 +--
 vllm/model_executor/models/nemotron.py        |  36 +-
 vllm/model_executor/models/nemotron_h.py      |  43 +-
 vllm/model_executor/models/nemotron_nas.py    |  34 +-
 vllm/model_executor/models/nemotron_vl.py     |  37 +-
 vllm/model_executor/models/nvlm_d.py          |   7 +-
 vllm/model_executor/models/olmo.py            |  27 +-
 vllm/model_executor/models/olmo2.py           |  15 +-
 vllm/model_executor/models/olmoe.py           |  21 +-
 vllm/model_executor/models/opt.py             |  33 +-
 vllm/model_executor/models/orion.py           |  28 +-
 vllm/model_executor/models/ovis.py            |  22 +-
 vllm/model_executor/models/ovis2_5.py         |  26 +-
 vllm/model_executor/models/paligemma.py       |  26 +-
 vllm/model_executor/models/persimmon.py       |  23 +-
 vllm/model_executor/models/phi.py             |  25 +-
 vllm/model_executor/models/phi3v.py           |  34 +-
 vllm/model_executor/models/phi4_multimodal.py |  48 +-
 vllm/model_executor/models/phi4mm.py          |  32 +-
 vllm/model_executor/models/phi4mm_audio.py    |  76 +-
 vllm/model_executor/models/phi4mm_utils.py    |  37 +-
 vllm/model_executor/models/phimoe.py          |  33 +-
 vllm/model_executor/models/pixtral.py         |  62 +-
 vllm/model_executor/models/plamo2.py          |  18 +-
 vllm/model_executor/models/qwen.py            |  30 +-
 vllm/model_executor/models/qwen2.py           |  32 +-
 .../models/qwen2_5_omni_thinker.py            |  48 +-
 vllm/model_executor/models/qwen2_5_vl.py      |  64 +-
 vllm/model_executor/models/qwen2_audio.py     |  28 +-
 vllm/model_executor/models/qwen2_moe.py       |  43 +-
 vllm/model_executor/models/qwen2_rm.py        |   7 +-
 vllm/model_executor/models/qwen2_vl.py        |  76 +-
 vllm/model_executor/models/qwen3.py           |  26 +-
 vllm/model_executor/models/qwen3_moe.py       |  39 +-
 vllm/model_executor/models/qwen3_next.py      |  27 +-
 vllm/model_executor/models/qwen3_next_mtp.py  |  11 +-
 .../models/qwen3_omni_moe_thinker.py          |  60 +-
 vllm/model_executor/models/qwen3_vl.py        |  70 +-
 vllm/model_executor/models/qwen3_vl_moe.py    |  11 +-
 vllm/model_executor/models/qwen_vl.py         |  56 +-
 vllm/model_executor/models/radio.py           |  38 +-
 vllm/model_executor/models/registry.py        |  46 +-
 vllm/model_executor/models/roberta.py         |  17 +-
 vllm/model_executor/models/rvl.py             |   3 +-
 vllm/model_executor/models/seed_oss.py        |  29 +-
 vllm/model_executor/models/siglip.py          |  35 +-
 vllm/model_executor/models/siglip2navit.py    |  19 +-
 vllm/model_executor/models/skyworkr1v.py      |  76 +-
 vllm/model_executor/models/smolvlm.py         |   5 +-
 vllm/model_executor/models/solar.py           |  30 +-
 vllm/model_executor/models/stablelm.py        |  25 +-
 vllm/model_executor/models/starcoder2.py      |  25 +-
 vllm/model_executor/models/step3_text.py      |  28 +-
 vllm/model_executor/models/step3_vl.py        |  44 +-
 vllm/model_executor/models/swin.py            |  51 +-
 vllm/model_executor/models/tarsier.py         |  47 +-
 vllm/model_executor/models/terratorch.py      |  32 +-
 vllm/model_executor/models/transformers.py    |  46 +-
 .../models/transformers_pooling.py            |  10 +-
 vllm/model_executor/models/ultravox.py        |  36 +-
 vllm/model_executor/models/utils.py           |  32 +-
 vllm/model_executor/models/vision.py          |  26 +-
 vllm/model_executor/models/voxtral.py         |  38 +-
 vllm/model_executor/models/whisper.py         |  56 +-
 vllm/model_executor/models/zamba2.py          |  42 +-
 vllm/model_executor/parameter.py              |  25 +-
 vllm/model_executor/utils.py                  |   4 +-
 vllm/multimodal/audio.py                      |   4 +-
 vllm/multimodal/cache.py                      |  40 +-
 vllm/multimodal/evs.py                        |   3 +-
 vllm/multimodal/hasher.py                     |   5 +-
 vllm/multimodal/image.py                      |   3 +-
 vllm/multimodal/inputs.py                     |  32 +-
 vllm/multimodal/parse.py                      |  46 +-
 vllm/multimodal/processing.py                 | 131 ++--
 vllm/multimodal/profiling.py                  |  30 +-
 vllm/multimodal/registry.py                   |  30 +-
 vllm/multimodal/utils.py                      |  24 +-
 vllm/multimodal/video.py                      |   4 +-
 vllm/outputs.py                               |  32 +-
 vllm/platforms/__init__.py                    |  12 +-
 vllm/platforms/cpu.py                         |   6 +-
 vllm/platforms/cuda.py                        |  11 +-
 vllm/platforms/interface.py                   |  42 +-
 vllm/platforms/rocm.py                        |  10 +-
 vllm/platforms/tpu.py                         |   6 +-
 vllm/platforms/xpu.py                         |   8 +-
 vllm/plugins/__init__.py                      |   3 +-
 vllm/plugins/io_processors/__init__.py        |   2 -
 vllm/plugins/io_processors/interface.py       |  14 +-
 .../lora_resolvers/filesystem_resolver.py     |   3 +-
 vllm/pooling_params.py                        |  18 +-
 vllm/profiler/layerwise_profile.py            |  19 +-
 vllm/profiler/utils.py                        |   4 +-
 vllm/ray/ray_env.py                           |   7 +-
 vllm/reasoning/abs_reasoning_parsers.py       |  18 +-
 vllm/reasoning/basic_parsers.py               |   7 +-
 .../reasoning/deepseek_r1_reasoning_parser.py |   3 +-
 vllm/reasoning/glm4_moe_reasoning_parser.py   |   5 +-
 vllm/reasoning/gptoss_reasoning_parser.py     |   5 +-
 vllm/reasoning/granite_reasoning_parser.py    |   7 +-
 .../hunyuan_a13b_reasoning_parser.py          |   5 +-
 vllm/reasoning/olmo3_reasoning_parser.py      |  16 +-
 vllm/reasoning/qwen3_reasoning_parser.py      |   5 +-
 vllm/reasoning/step3_reasoning_parser.py      |   5 +-
 vllm/sampling_params.py                       | 102 +--
 vllm/scalar_type.py                           |  13 +-
 vllm/sequence.py                              |  20 +-
 vllm/tracing.py                               |   7 +-
 .../chat_templates/registry.py                |  11 +-
 vllm/transformers_utils/config.py             |  93 +--
 vllm/transformers_utils/config_parser_base.py |   7 +-
 vllm/transformers_utils/configs/dotsocr.py    |   4 +-
 vllm/transformers_utils/configs/eagle.py      |  11 +-
 vllm/transformers_utils/configs/kimi_vl.py    |   5 +-
 vllm/transformers_utils/configs/lfm2_moe.py   |   3 +-
 vllm/transformers_utils/configs/medusa.py     |   5 +-
 .../transformers_utils/configs/midashenglm.py |  13 +-
 .../configs/mlp_speculator.py                 |   3 +-
 vllm/transformers_utils/configs/ovis.py       |   8 +-
 vllm/transformers_utils/configs/radio.py      |   8 +-
 .../configs/speculators/base.py               |   4 +-
 vllm/transformers_utils/configs/step3_vl.py   |   8 +-
 vllm/transformers_utils/configs/ultravox.py   |  10 +-
 vllm/transformers_utils/detokenizer_utils.py  |   5 +-
 vllm/transformers_utils/dynamic_module.py     |  17 +-
 vllm/transformers_utils/processor.py          |  22 +-
 vllm/transformers_utils/processors/ovis.py    |   8 +-
 vllm/transformers_utils/processors/ovis2_5.py |  18 +-
 vllm/transformers_utils/runai_utils.py        |   5 +-
 vllm/transformers_utils/s3_utils.py           |   6 +-
 vllm/transformers_utils/tokenizer.py          |  18 +-
 vllm/transformers_utils/tokenizer_base.py     |  22 +-
 vllm/transformers_utils/tokenizers/mistral.py |  30 +-
 vllm/transformers_utils/utils.py              |  14 +-
 vllm/usage/usage_lib.py                       |  42 +-
 vllm/utils/__init__.py                        |  64 +-
 vllm/utils/cache.py                           |  24 +-
 vllm/utils/deep_gemm.py                       |   5 +-
 vllm/utils/flashinfer.py                      |   5 +-
 vllm/utils/gc_utils.py                        |   4 +-
 vllm/utils/jsontree.py                        |  57 +-
 vllm/utils/tensor_schema.py                   |  22 +-
 vllm/v1/attention/backends/cpu_attn.py        |  58 +-
 vllm/v1/attention/backends/flash_attn.py      |  49 +-
 vllm/v1/attention/backends/flashinfer.py      |  18 +-
 vllm/v1/attention/backends/flex_attention.py  |  43 +-
 vllm/v1/attention/backends/gdn_attn.py        |  29 +-
 vllm/v1/attention/backends/mamba1_attn.py     |   3 +-
 vllm/v1/attention/backends/mamba2_attn.py     |  15 +-
 vllm/v1/attention/backends/mla/common.py      |  69 +-
 vllm/v1/attention/backends/mla/cutlass_mla.py |  16 +-
 .../attention/backends/mla/flashattn_mla.py   |  18 +-
 .../attention/backends/mla/flashinfer_mla.py  |  18 +-
 vllm/v1/attention/backends/mla/flashmla.py    |  18 +-
 .../attention/backends/mla/flashmla_sparse.py |  20 +-
 vllm/v1/attention/backends/mla/indexer.py     |   8 +-
 .../attention/backends/mla/rocm_aiter_mla.py  |  24 +-
 vllm/v1/attention/backends/mla/triton_mla.py  |  13 +-
 vllm/v1/attention/backends/pallas.py          |  17 +-
 vllm/v1/attention/backends/rocm_aiter_fa.py   |  29 +-
 .../backends/rocm_aiter_unified_attn.py       |  18 +-
 vllm/v1/attention/backends/rocm_attn.py       |  28 +-
 vllm/v1/attention/backends/short_conv_attn.py |   9 +-
 vllm/v1/attention/backends/tree_attn.py       |  24 +-
 vllm/v1/attention/backends/triton_attn.py     |  30 +-
 vllm/v1/attention/backends/utils.py           |  22 +-
 vllm/v1/attention/backends/xformers.py        |  18 +-
 vllm/v1/core/block_pool.py                    |  14 +-
 vllm/v1/core/kv_cache_coordinator.py          |   5 +-
 vllm/v1/core/kv_cache_manager.py              |  14 +-
 vllm/v1/core/kv_cache_utils.py                |  20 +-
 vllm/v1/core/sched/async_scheduler.py         |   2 -
 vllm/v1/core/sched/interface.py               |   4 +-
 vllm/v1/core/sched/output.py                  |  17 +-
 vllm/v1/core/sched/request_queue.py           |   4 +-
 vllm/v1/core/sched/scheduler.py               |   6 +-
 vllm/v1/core/sched/utils.py                   |   3 +-
 vllm/v1/cudagraph_dispatcher.py               |   3 +-
 vllm/v1/engine/__init__.py                    |  54 +-
 vllm/v1/engine/async_llm.py                   |  70 +-
 vllm/v1/engine/coordinator.py                 |   5 +-
 vllm/v1/engine/core.py                        |  48 +-
 vllm/v1/engine/core_client.py                 | 102 ++-
 vllm/v1/engine/detokenizer.py                 |  11 +-
 vllm/v1/engine/llm_engine.py                  |  42 +-
 vllm/v1/engine/logprobs.py                    |  19 +-
 vllm/v1/engine/output_processor.py            |  90 ++-
 vllm/v1/engine/parallel_sampling.py           |   2 +-
 vllm/v1/engine/processor.py                   |  34 +-
 vllm/v1/engine/utils.py                       |  28 +-
 vllm/v1/executor/abstract.py                  |  13 +-
 vllm/v1/executor/multiproc_executor.py        |  31 +-
 vllm/v1/executor/ray_distributed_executor.py  |   5 +-
 vllm/v1/kv_cache_interface.py                 |  13 +-
 vllm/v1/kv_offload/abstract.py                |   3 +-
 vllm/v1/kv_offload/cpu.py                     |   5 +-
 vllm/v1/kv_offload/factory.py                 |   3 +-
 vllm/v1/kv_offload/lru_manager.py             |   5 +-
 vllm/v1/metrics/loggers.py                    |  55 +-
 vllm/v1/metrics/prometheus.py                 |   3 +-
 vllm/v1/metrics/ray_wrappers.py               |  23 +-
 vllm/v1/metrics/reader.py                     |   3 +-
 vllm/v1/metrics/stats.py                      |  22 +-
 vllm/v1/outputs.py                            |  24 +-
 vllm/v1/pool/metadata.py                      |   5 +-
 vllm/v1/request.py                            |  44 +-
 vllm/v1/sample/logits_processor/__init__.py   |  16 +-
 vllm/v1/sample/logits_processor/builtin.py    |  18 +-
 vllm/v1/sample/logits_processor/interface.py  |   2 +-
 vllm/v1/sample/logits_processor/state.py      |  18 +-
 vllm/v1/sample/metadata.py                    |  15 +-
 vllm/v1/sample/ops/topk_topp_sampler.py       |  27 +-
 vllm/v1/sample/rejection_sampler.py           |   9 +-
 vllm/v1/sample/sampler.py                     |   6 +-
 vllm/v1/sample/tpu/metadata.py                |   3 +-
 vllm/v1/sample/tpu/sampler.py                 |   6 +-
 vllm/v1/serial_utils.py                       |  20 +-
 vllm/v1/spec_decode/eagle.py                  |  11 +-
 vllm/v1/spec_decode/metrics.py                |   3 +-
 vllm/v1/structured_output/__init__.py         |   7 +-
 vllm/v1/structured_output/backend_guidance.py |  10 +-
 .../backend_lm_format_enforcer.py             |   4 +-
 vllm/v1/structured_output/backend_outlines.py |   2 -
 vllm/v1/structured_output/backend_types.py    |   9 +-
 vllm/v1/structured_output/backend_xgrammar.py |   2 -
 vllm/v1/structured_output/request.py          |  14 +-
 vllm/v1/structured_output/utils.py            |   7 +-
 vllm/v1/utils.py                              |  22 +-
 vllm/v1/worker/block_table.py                 |   3 +-
 vllm/v1/worker/cpu_model_runner.py            |   4 +-
 vllm/v1/worker/cpu_worker.py                  |   4 +-
 vllm/v1/worker/dp_utils.py                    |  11 +-
 vllm/v1/worker/gpu_input_batch.py             |  44 +-
 vllm/v1/worker/gpu_model_runner.py            |  83 ++-
 vllm/v1/worker/gpu_ubatch_wrapper.py          |   9 +-
 vllm/v1/worker/gpu_worker.py                  |  18 +-
 .../worker/kv_connector_model_runner_mixin.py |   7 +-
 vllm/v1/worker/lora_model_runner_mixin.py     |  11 +-
 vllm/v1/worker/tpu_input_batch.py             |  18 +-
 vllm/v1/worker/tpu_model_runner.py            |  14 +-
 vllm/v1/worker/tpu_worker.py                  |   7 +-
 vllm/v1/worker/ubatch_utils.py                |   2 +-
 vllm/v1/worker/utils.py                       |  10 +-
 vllm/v1/worker/worker_base.py                 |  10 +-
 944 files changed, 9490 insertions(+), 10121 deletions(-)

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index ba7c733be0b25..4021fede72153 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -8,7 +8,6 @@ import sys
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import Optional, Union
 
 import aiohttp
 import huggingface_hub.constants
@@ -28,13 +27,13 @@ class RequestFuncInput:
     prompt_len: int
     output_len: int
     model: str
-    model_name: Optional[str] = None
-    logprobs: Optional[int] = None
-    extra_body: Optional[dict] = None
-    multi_modal_content: Optional[dict | list[dict]] = None
+    model_name: str | None = None
+    logprobs: int | None = None
+    extra_body: dict | None = None
+    multi_modal_content: dict | list[dict] | None = None
     ignore_eos: bool = False
-    language: Optional[str] = None
-    request_id: Optional[str] = None
+    language: str | None = None
+    request_id: str | None = None
 
 
 @dataclass
@@ -52,7 +51,7 @@ class RequestFuncOutput:
 
 async def async_request_tgi(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith("generate_stream")
@@ -133,7 +132,7 @@ async def async_request_tgi(
 
 async def async_request_trt_llm(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith("generate_stream")
@@ -204,7 +203,7 @@ async def async_request_trt_llm(
 
 async def async_request_deepspeed_mii(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("completions", "profile")), (
@@ -267,7 +266,7 @@ async def async_request_deepspeed_mii(
 
 async def async_request_openai_completions(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("completions", "profile")), (
@@ -367,7 +366,7 @@ async def async_request_openai_completions(
 
 async def async_request_openai_chat_completions(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("chat/completions", "profile")), (
@@ -476,7 +475,7 @@ async def async_request_openai_chat_completions(
 
 async def async_request_openai_audio(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     # Lazy import without PlaceholderModule to avoid vllm dep.
     import soundfile
@@ -610,7 +609,7 @@ def get_tokenizer(
     tokenizer_mode: str = "auto",
     trust_remote_code: bool = False,
     **kwargs,
-) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
     if pretrained_model_name_or_path is not None and not os.path.exists(
         pretrained_model_name_or_path
     ):
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index b5e2613de1cd4..d7dc0e991c4d1 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -32,7 +32,6 @@ import dataclasses
 import json
 import random
 import time
-from typing import Optional
 
 from transformers import PreTrainedTokenizerBase
 
@@ -80,7 +79,7 @@ def sample_requests_from_dataset(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     input_length_range: tuple[int, int],
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
 ) -> list[Request]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
@@ -128,7 +127,7 @@ def sample_requests_from_random(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     input_length_range: tuple[int, int],
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
     prefix_len: int,
 ) -> list[Request]:
     requests = []
diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py
index bb453791c1862..769f52dbab6ea 100644
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@@ -7,7 +7,6 @@ import dataclasses
 import json
 import random
 import time
-from typing import Optional
 
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 
@@ -24,7 +23,7 @@ def sample_requests(
     dataset_path: str,
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
 ) -> list[tuple[str, int, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index 58b9767d09390..059668f1789cc 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -32,7 +32,6 @@ import uuid
 import warnings
 from collections.abc import AsyncGenerator
 from dataclasses import dataclass
-from typing import Optional
 
 import datasets
 import numpy as np
@@ -316,7 +315,7 @@ def calculate_metrics(
     tokenizer: PreTrainedTokenizerBase,
     selected_percentile_metrics: list[str],
     selected_percentiles: list[float],
-    goodput_config_dict: Optional[dict[str, float]] = None,
+    goodput_config_dict: dict[str, float] | None = None,
 ) -> tuple[BenchmarkMetrics, list[int]]:
     actual_output_lens: list[int] = []
     total_input = 0
@@ -436,9 +435,9 @@ async def benchmark(
     selected_percentile_metrics: list[str],
     selected_percentiles: list[str],
     ignore_eos: bool,
-    max_concurrency: Optional[int],
+    max_concurrency: int | None,
     structured_output_ratio: float,
-    goodput_config_dict: Optional[dict[str, float]] = None,
+    goodput_config_dict: dict[str, float] | None = None,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py
index 98624abdf49fb..f0d661f9d5349 100644
--- a/benchmarks/benchmark_utils.py
+++ b/benchmarks/benchmark_utils.py
@@ -6,7 +6,7 @@ import math
 import os
 import time
 from types import TracebackType
-from typing import Any, Optional, Union
+from typing import Any
 
 
 def convert_to_pytorch_benchmark_format(
@@ -92,7 +92,7 @@ class TimeCollector:
     def __init__(self, scale: int) -> None:
         self.cnt: int = 0
         self._sum: int = 0
-        self._max: Optional[int] = None
+        self._max: int | None = None
         self.scale = scale
         self.start_time: int = time.monotonic_ns()
 
@@ -104,13 +104,13 @@ class TimeCollector:
         else:
             self._max = max(self._max, v)
 
-    def avg(self) -> Union[float, str]:
+    def avg(self) -> float | str:
         return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
 
-    def max(self) -> Union[float, str]:
+    def max(self) -> float | str:
         return self._max / self.scale if self._max else "N/A"
 
-    def dump_avg_max(self) -> list[Union[float, str]]:
+    def dump_avg_max(self) -> list[float | str]:
         return [self.avg(), self.max()]
 
     def __enter__(self) -> None:
@@ -118,8 +118,8 @@ class TimeCollector:
 
     def __exit__(
         self,
-        exc_type: Optional[type[BaseException]],
-        exc_value: Optional[BaseException],
-        exc_traceback: Optional[TracebackType],
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
     ) -> None:
         self.collect(time.monotonic_ns() - self.start_time)
diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
index 9ec270bbd2e98..22fc2678fd1c9 100644
--- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py
@@ -6,8 +6,7 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
-from typing import Callable
+from collections.abc import Callable, Iterable
 
 import torch
 import torch.utils.benchmark as TBenchmark
diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
index 02f8c593392c4..2deebf3ddb7ae 100644
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -6,8 +6,7 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
-from typing import Callable, Optional
+from collections.abc import Callable, Iterable
 
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -53,7 +52,7 @@ def bench_int8(
     n: int,
     label: str,
     sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     """Benchmark INT8-based kernels."""
     assert dtype == torch.int8
@@ -108,7 +107,7 @@ def bench_fp8(
     n: int,
     label: str,
     sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     """Benchmark FP8-based kernels."""
     assert dtype == torch.float8_e4m3fn
@@ -183,7 +182,7 @@ def bench(
     n: int,
     label: str,
     sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     if dtype == torch.int8:
         return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
@@ -201,7 +200,7 @@ def print_timers(timers: Iterable[TMeasurement]):
 def run(
     dtype: torch.dtype,
     MKNs: Iterable[tuple[int, int, int]],
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     results = []
     for m, k, n in MKNs:
diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
index 901524214469e..d809bf1db8cbc 100644
--- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
+++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py
@@ -3,10 +3,9 @@
 
 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Optional
 
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]:
 def unfused_int8_impl(
     rms_norm_layer: RMSNorm,
     x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
 ):
     # Norm
@@ -68,7 +67,7 @@ def unfused_int8_impl(
 def unfused_fp8_impl(
     rms_norm_layer: RMSNorm,
     x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
 ):
     # Norm
@@ -85,7 +84,7 @@ def unfused_fp8_impl(
 def fused_impl(
     rms_norm_layer: RMSNorm,  # this stores the weights
     x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
 ):
     out, _ = ops.rms_norm_dynamic_per_token_quant(
diff --git a/benchmarks/kernels/bench_per_token_quant_fp8.py b/benchmarks/kernels/bench_per_token_quant_fp8.py
index e08e5680c191e..9a52ea7f47e3a 100644
--- a/benchmarks/kernels/bench_per_token_quant_fp8.py
+++ b/benchmarks/kernels/bench_per_token_quant_fp8.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
-from typing import Callable
+from collections.abc import Callable
 from unittest.mock import patch
 
 import pandas as pd
diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py
index 4cbdde5a5b2ca..df06a940e6d41 100644
--- a/benchmarks/kernels/benchmark_device_communicators.py
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@@ -22,8 +22,8 @@ Example:
 import json
 import os
 import time
+from collections.abc import Callable
 from contextlib import nullcontext
-from typing import Callable, Optional
 
 import torch
 import torch.distributed as dist
@@ -264,12 +264,12 @@ class CommunicatorBenchmark:
     def benchmark_allreduce_single(
         self,
         sequence_length: int,
-        allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
+        allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
         should_use_fn: Callable[[torch.Tensor], bool],
         context,
         num_warmup: int,
         num_trials: int,
-    ) -> Optional[float]:
+    ) -> float | None:
         """Benchmark method with CUDA graph optimization."""
         try:
             # Create test tensor (2D: sequence_length x hidden_size)
diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py
index 799b16999873f..39338f3387613 100644
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -6,11 +6,12 @@ import copy
 import json
 import pickle
 import time
+from collections.abc import Callable
 from dataclasses import dataclass
 from enum import Enum, auto
 from itertools import product
 from pathlib import Path
-from typing import Any, Callable, Optional
+from typing import Any
 
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -158,7 +159,7 @@ def ref_group_gemm(
     seq_lens_cpu: torch.Tensor,
     prompt_lora_mapping_cpu: torch.Tensor,
     scaling: float,
-    add_inputs: Optional[bool],
+    add_inputs: bool | None,
 ):
     """
     Torch group gemm reference implementation to test correctness of
@@ -316,8 +317,8 @@ class BenchmarkContext:
     lora_rank: int
     sort_by_lora_id: bool
     dtype: torch.dtype
-    seq_length: Optional[int] = None
-    num_slices: Optional[int] = None  # num_slices for slice based ops
+    seq_length: int | None = None
+    num_slices: int | None = None  # num_slices for slice based ops
 
     def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
         ctx = copy.copy(self)
@@ -561,7 +562,7 @@ class BenchmarkTensors:
         }
 
     def bench_fn_kwargs(
-        self, op_type: OpType, add_inputs: Optional[bool] = None
+        self, op_type: OpType, add_inputs: bool | None = None
     ) -> dict[str, Any]:
         if op_type.is_shrink_fn():
             assert add_inputs is None
@@ -575,7 +576,7 @@ class BenchmarkTensors:
         raise ValueError(f"Unrecognized optype {self}")
 
     def test_correctness(
-        self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
+        self, op_type: OpType, expand_fn_add_inputs: bool | None
     ) -> bool:
         """
         Test correctness of op_type implementation against a grouped gemm
@@ -611,8 +612,8 @@ def bench_optype(
     ctx: BenchmarkContext,
     arg_pool_size: int,
     op_type: OpType,
-    cuda_graph_nops: Optional[int] = None,
-    expand_fn_add_inputs: Optional[bool] = None,
+    cuda_graph_nops: int | None = None,
+    expand_fn_add_inputs: bool | None = None,
     test_correctness: bool = False,
 ) -> TMeasurement:
     assert arg_pool_size >= 1
@@ -679,7 +680,7 @@ def bench_torch_mm(
     ctx: BenchmarkContext,
     arg_pool_size: int,
     op_type: OpType,
-    cuda_graph_nops: Optional[int] = None,
+    cuda_graph_nops: int | None = None,
 ) -> TMeasurement:
     """
     Benchmark basic torch.mm as a roofline.
@@ -744,7 +745,7 @@ def use_cuda_graph_recommendation() -> str:
             """
 
 
-def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
+def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None):
     compare = TBenchmark.Compare(timers)
     compare.print()
 
diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py
index 1b1c3b321cce4..e1d5239f5cc97 100644
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -8,10 +8,9 @@ import math
 import os
 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Optional
 
 import pandas as pd
 import torch
@@ -63,23 +62,23 @@ class BenchmarkTensors:
     a: torch.Tensor
 
     w_q: torch.Tensor
-    group_size: Optional[int]
+    group_size: int | None
     wtype: ScalarType
     w_g_s: torch.Tensor
-    w_g_zp: Optional[torch.Tensor]
-    w_ch_s: Optional[torch.Tensor]
-    w_tok_s: Optional[torch.Tensor]
+    w_g_zp: torch.Tensor | None
+    w_ch_s: torch.Tensor | None
+    w_tok_s: torch.Tensor | None
 
 
 @dataclass
 class TypeConfig:
     act_type: torch.dtype
     weight_type: ScalarType
-    output_type: Optional[torch.dtype]
-    group_scale_type: Optional[torch.dtype]
-    group_zero_type: Optional[torch.dtype]
-    channel_scale_type: Optional[torch.dtype]
-    token_scale_type: Optional[torch.dtype]
+    output_type: torch.dtype | None
+    group_scale_type: torch.dtype | None
+    group_zero_type: torch.dtype | None
+    channel_scale_type: torch.dtype | None
+    token_scale_type: torch.dtype | None
 
 
 def rand_data(shape, dtype=torch.float16, scale=1):
@@ -93,8 +92,8 @@ def quantize_and_pack(
     atype: torch.dtype,
     w: torch.Tensor,
     wtype: ScalarType,
-    stype: Optional[torch.dtype],
-    group_size: Optional[int],
+    stype: torch.dtype | None,
+    group_size: int | None,
     zero_points: bool = False,
 ):
     assert wtype.is_integer(), "TODO: support floating point weights"
@@ -113,7 +112,7 @@ def quantize_and_pack(
 
 
 def create_bench_tensors(
-    shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
+    shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
 ) -> list[BenchmarkTensors]:
     m, n, k = shape
 
@@ -331,8 +330,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
     return res
 
 
-_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
-_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
+_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None
+_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None
 
 
 def bench(
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index 7e0376c18ecc7..8f9907952d24d 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -3,7 +3,6 @@
 
 import random
 import time
-from typing import Optional
 
 import torch
 
@@ -37,7 +36,7 @@ def main(
     seed: int,
     do_profile: bool,
     device: str = "cuda",
-    kv_cache_dtype: Optional[str] = None,
+    kv_cache_dtype: str | None = None,
 ) -> None:
     current_platform.seed_everything(seed)
 
diff --git a/benchmarks/kernels/benchmark_per_token_group_quant.py b/benchmarks/kernels/benchmark_per_token_group_quant.py
index 1ccb5e08b3d57..bdc1eb733084e 100644
--- a/benchmarks/kernels/benchmark_per_token_group_quant.py
+++ b/benchmarks/kernels/benchmark_per_token_group_quant.py
@@ -3,8 +3,8 @@
 
 import argparse
 import math
+from collections.abc import Callable
 from contextlib import contextmanager
-from typing import Callable
 from unittest.mock import patch
 
 import torch
diff --git a/benchmarks/kernels/benchmark_reshape_and_cache.py b/benchmarks/kernels/benchmark_reshape_and_cache.py
index af9841daadf24..d4b564d2ec6c9 100644
--- a/benchmarks/kernels/benchmark_reshape_and_cache.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import random
 import time
 
diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
index 0aace571064a0..93df14f0d95cc 100644
--- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import random
 import time
 
diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py
index 4cf633a81358d..d8d7f5bcf9dad 100644
--- a/benchmarks/kernels/benchmark_rmsnorm.py
+++ b/benchmarks/kernels/benchmark_rmsnorm.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
-from typing import Optional, Union
 
 import torch
 from flashinfer.norm import fused_add_rmsnorm, rmsnorm
@@ -21,8 +20,8 @@ class HuggingFaceRMSNorm(nn.Module):
     def forward(
         self,
         x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        residual: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         orig_dtype = x.dtype
         x = x.to(torch.float32)
         if residual is not None:
@@ -41,7 +40,7 @@ class HuggingFaceRMSNorm(nn.Module):
 def rmsnorm_naive(
     x: torch.Tensor,
     weight: torch.Tensor,
-    residual: Optional[torch.Tensor] = None,
+    residual: torch.Tensor | None = None,
     eps: float = 1e-6,
 ):
     naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
@@ -65,7 +64,7 @@ def rmsnorm_naive(
 def rmsnorm_flashinfer(
     x: torch.Tensor,
     weight: torch.Tensor,
-    residual: Optional[torch.Tensor] = None,
+    residual: torch.Tensor | None = None,
     eps: float = 1e-6,
 ):
     orig_shape = x.shape
@@ -89,7 +88,7 @@ def rmsnorm_flashinfer(
 def rmsnorm_vllm(
     x: torch.Tensor,
     weight: torch.Tensor,
-    residual: Optional[torch.Tensor] = None,
+    residual: torch.Tensor | None = None,
     eps: float = 1e-6,
 ):
     orig_shape = x.shape
diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
index b81baf17a8c67..24869c91a8d70 100644
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from itertools import accumulate
-from typing import Optional
 
 import nvtx
 import torch
@@ -18,7 +17,7 @@ def benchmark_rope_kernels_multi_lora(
     seq_len: int,
     num_heads: int,
     head_size: int,
-    rotary_dim: Optional[int],
+    rotary_dim: int | None,
     dtype: torch.dtype,
     seed: int,
     device: str,
diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
index 6ddab46214577..f7cdc25794cae 100644
--- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
@@ -4,7 +4,6 @@
 import csv
 import os
 from datetime import datetime
-from typing import Optional
 
 import flashinfer
 import torch
@@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
 @torch.no_grad()
 def benchmark_decode(
     dtype: torch.dtype,
-    quant_dtypes: tuple[
-        Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
-    ],
+    quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
     batch_size: int,
     max_seq_len: int,
     num_heads: tuple[int, int] = (64, 8),
diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
index 131df74c7de1b..7993354475fcc 100644
--- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
@@ -4,7 +4,6 @@
 import csv
 import os
 from datetime import datetime
-from typing import Optional
 
 import flashinfer
 import torch
@@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
 @torch.no_grad()
 def benchmark_prefill(
     dtype: torch.dtype,
-    quant_dtypes: tuple[
-        Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
-    ],
+    quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
     batch_size: int,
     max_seq_len: int,
     num_heads: tuple[int, int] = (64, 8),
diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py
index 4bbb36bb43592..a9af811bbe9ca 100644
--- a/benchmarks/kernels/utils.py
+++ b/benchmarks/kernels/utils.py
@@ -2,8 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
-from collections.abc import Iterable
-from typing import Any, Callable, Optional
+from collections.abc import Callable, Iterable
+from typing import Any
 
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -55,7 +55,7 @@ class Bench:
 
     def __init__(
         self,
-        cuda_graph_params: Optional[CudaGraphBenchParams],
+        cuda_graph_params: CudaGraphBenchParams | None,
         label: str,
         sub_label: str,
         description: str,
diff --git a/benchmarks/multi_turn/bench_dataset.py b/benchmarks/multi_turn/bench_dataset.py
index 67b937930d58c..2674899d1cc56 100644
--- a/benchmarks/multi_turn/bench_dataset.py
+++ b/benchmarks/multi_turn/bench_dataset.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from statistics import mean
-from typing import Any, NamedTuple, Optional, Union
+from typing import Any, NamedTuple
 
 import numpy as np  # type: ignore
 import pandas as pd  # type: ignore
@@ -35,8 +35,8 @@ class Distribution(ABC):
 class UniformDistribution(Distribution):
     def __init__(
         self,
-        min_val: Union[int, float],
-        max_val: Union[int, float],
+        min_val: int | float,
+        max_val: int | float,
         is_integer: bool = True,
     ) -> None:
         self.min_val = min_val
@@ -56,7 +56,7 @@ class UniformDistribution(Distribution):
 
 
 class ConstantDistribution(Distribution):
-    def __init__(self, value: Union[int, float]) -> None:
+    def __init__(self, value: int | float) -> None:
         self.value = value
         self.max_val = value
 
@@ -68,7 +68,7 @@ class ConstantDistribution(Distribution):
 
 
 class ZipfDistribution(Distribution):
-    def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
+    def __init__(self, alpha: float, max_val: int | None = None) -> None:
         self.alpha = alpha
         self.max_val = max_val
 
@@ -83,7 +83,7 @@ class ZipfDistribution(Distribution):
 
 
 class PoissonDistribution(Distribution):
-    def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
+    def __init__(self, alpha: float, max_val: int | None = None) -> None:
         self.alpha = alpha
         self.max_val = max_val
 
@@ -100,11 +100,11 @@ class PoissonDistribution(Distribution):
 class LognormalDistribution(Distribution):
     def __init__(
         self,
-        mean: Optional[float] = None,
-        sigma: Optional[float] = None,
-        average: Optional[int] = None,
-        median_ratio: Optional[float] = None,
-        max_val: Optional[int] = None,
+        mean: float | None = None,
+        sigma: float | None = None,
+        average: int | None = None,
+        median_ratio: float | None = None,
+        max_val: int | None = None,
     ) -> None:
         self.average = average
         self.median_ratio = median_ratio
diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
index 233ed460fc8d5..2b0a6da60c256 100644
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@@ -13,7 +13,7 @@ from datetime import datetime
 from enum import Enum
 from http import HTTPStatus
 from statistics import mean
-from typing import NamedTuple, Union
+from typing import NamedTuple
 
 import aiohttp  # type: ignore
 import numpy as np  # type: ignore
@@ -169,7 +169,7 @@ class MovingAverage:
 class DebugStats:
     def __init__(self, logger: logging.Logger, window_size: int) -> None:
         self.logger = logger
-        self.metrics: dict[str, Union[MovingAverage, MetricStats]] = {
+        self.metrics: dict[str, MovingAverage | MetricStats] = {
             "moving_avg_ttft_ms": MovingAverage(window_size),
             "moving_avg_tpot_ms": MovingAverage(window_size),
             "ttft_ms": MetricStats(),
@@ -636,7 +636,7 @@ async def client_main(
 
             if args.verbose:
                 curr_time_sec: float = time.perf_counter()
-                time_since_last_turn: Union[str, float] = "N/A"
+                time_since_last_turn: str | float = "N/A"
                 if conv_id in time_of_last_turn:
                     time_since_last_turn = round(
                         curr_time_sec - time_of_last_turn[conv_id], 3
@@ -928,13 +928,13 @@ async def main_mp(
                     f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}"  # noqa: E501
                 )
 
-                rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3)
+                rps: str | float = round(len(client_metrics) / runtime_sec, 3)
                 if len(client_metrics) < (5 * bench_args.num_clients):
                     # Do not estimate the RPS if the number of samples is very low
                     # (threshold can be tuned if needed)
                     rps = "N/A"
 
-                runtime_left_sec: Union[str, float] = round(
+                runtime_left_sec: str | float = round(
                     (runtime_sec / finished_convs) * (total_convs - finished_convs), 3
                 )
                 if percent < 0.05:
diff --git a/benchmarks/multi_turn/convert_sharegpt_to_openai.py b/benchmarks/multi_turn/convert_sharegpt_to_openai.py
index c3622c99a2e53..fccab4d0ce21a 100644
--- a/benchmarks/multi_turn/convert_sharegpt_to_openai.py
+++ b/benchmarks/multi_turn/convert_sharegpt_to_openai.py
@@ -13,7 +13,7 @@ import argparse
 import json
 import random
 from statistics import mean
-from typing import Any, Optional
+from typing import Any
 
 import pandas as pd  # type: ignore
 import tqdm  # type: ignore
@@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool:
 
 
 def content_is_valid(
-    content: str, min_content_len: Optional[int], max_content_len: Optional[int]
+    content: str, min_content_len: int | None, max_content_len: int | None
 ) -> bool:
     if min_content_len and len(content) < min_content_len:
         return False
@@ -37,7 +37,7 @@ def content_is_valid(
 
 
 def print_stats(
-    conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
+    conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None
 ) -> None:
     # Collect statistics
     stats = []
@@ -109,12 +109,12 @@ def convert_sharegpt_to_openai(
     seed: int,
     input_file: str,
     output_file: str,
-    max_items: Optional[int],
-    min_content_len: Optional[int] = None,
-    max_content_len: Optional[int] = None,
-    min_turns: Optional[int] = None,
-    max_turns: Optional[int] = None,
-    model: Optional[str] = None,
+    max_items: int | None,
+    min_content_len: int | None = None,
+    max_content_len: int | None = None,
+    min_turns: int | None = None,
+    max_turns: int | None = None,
+    model: str | None = None,
 ) -> None:
     if min_turns and max_turns:
         assert min_turns <= max_turns
diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
index 5e742d0b02932..34fb64c413db2 100644
--- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
+++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
-from typing import Union
 
 from cutlass_library import *
 
@@ -22,7 +21,7 @@ class MixedInputKernelScheduleType(enum.Enum):
     TmaWarpSpecializedCooperative = enum_auto()
 
 
-VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = {
     **DataTypeNames,  # type: ignore
     **{
         VLLMDataType.u4b8: "u4b8",
@@ -30,7 +29,7 @@ VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = {
     },
 }
 
-VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = {
     **DataTypeTag,  # type: ignore
     **{
         VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t",
@@ -38,7 +37,7 @@ VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
     },
 }
 
-VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
+VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = {
     **DataTypeSize,  # type: ignore
     **{
         VLLMDataType.u4b8: 4,
@@ -46,7 +45,7 @@ VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = {
     },
 }
 
-VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = {
     VLLMDataType.u4b8: "vllm::kU4B8",
     VLLMDataType.u8b128: "vllm::kU8B128",
     DataType.u4: "vllm::kU4",
@@ -57,7 +56,7 @@ VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = {
     DataType.bf16: "vllm::kBfloat16",
 }
 
-VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
+VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = {
     DataType.u8: "at::ScalarType::Byte",
     DataType.s8: "at::ScalarType::Char",
     DataType.e4m3: "at::ScalarType::Float8_e4m3fn",
@@ -67,9 +66,7 @@ VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = {
     DataType.f32: "at::ScalarType::Float",
 }
 
-VLLMKernelScheduleTag: dict[
-    Union[MixedInputKernelScheduleType, KernelScheduleType], str
-] = {
+VLLMKernelScheduleTag: dict[MixedInputKernelScheduleType | KernelScheduleType, str] = {
     **KernelScheduleTag,  # type: ignore
     **{
         MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized",  # noqa: E501
diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py
index d29a199c5d32f..8bd17ba69cec1 100644
--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@@ -9,7 +9,6 @@ from collections.abc import Iterable
 from copy import deepcopy
 from dataclasses import dataclass, fields
 from functools import reduce
-from typing import Optional, Union
 
 import jinja2
 from vllm_cutlass_library_extension import (
@@ -259,7 +258,7 @@ class ScheduleConfig:
 @dataclass(frozen=True)
 class TypeConfig:
     a: DataType
-    b: Union[DataType, VLLMDataType]
+    b: DataType | VLLMDataType
     b_group_scale: DataType
     b_group_zeropoint: DataType
     b_channel_scale: DataType
@@ -280,7 +279,7 @@ class PrepackTypeConfig:
 class ImplConfig:
     types: TypeConfig
     schedules: list[ScheduleConfig]
-    heuristic: list[tuple[Optional[str], ScheduleConfig]]
+    heuristic: list[tuple[str | None, ScheduleConfig]]
 
 
 def generate_sch_sig(schedule_config: ScheduleConfig) -> str:
diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md
index 62e58e5c6ac58..4ce748ce1fed4 100644
--- a/docs/contributing/model/transcription.md
+++ b/docs/contributing/model/transcription.md
@@ -16,7 +16,7 @@ Declare supported languages and capabilities:
 
 ??? code "supported_languages and supports_transcription_only"
     ```python
-    from typing import ClassVar, Mapping, Optional, Literal
+    from typing import ClassVar, Mapping, Literal
     import numpy as np
     import torch
     from torch import nn
@@ -81,10 +81,10 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
             audio: np.ndarray,
             stt_config: SpeechToTextConfig,
             model_config: ModelConfig,
-            language: Optional[str],
+            language: str | None,
             task_type: Literal["transcribe", "translate"],
             request_prompt: str,
-            to_language: Optional[str],
+            to_language: str | None,
         ) -> PromptType:
             # Example with a free-form instruction prompt
             task_word = "Transcribe" if task_type == "transcribe" else "Translate"
@@ -117,10 +117,10 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
             audio: np.ndarray,
             stt_config: SpeechToTextConfig,
             model_config: ModelConfig,
-            language: Optional[str],
+            language: str | None,
             task_type: Literal["transcribe", "translate"],
             request_prompt: str,
-            to_language: Optional[str],
+            to_language: str | None,
         ) -> PromptType:
             if language is None:
                 raise ValueError("Language must be specified")
@@ -150,7 +150,7 @@ If your model requires a language and you want a default, override this method (
 ??? code "validate_language()"
     ```python
     @classmethod
-    def validate_language(cls, language: Optional[str]) -> Optional[str]:
+    def validate_language(cls, language: str | None) -> str | None:
         if language is None:
             logger.warning(
                 "Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.")
@@ -175,7 +175,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics:
             audio_duration_s: float,
             stt_config: SpeechToTextConfig,
             model_config: ModelConfig,
-        ) -> Optional[int]:
+        ) -> int | None:
             # Return None if unknown; otherwise return an estimate.
             return int(audio_duration_s * stt_config.sample_rate // 320)  # example
     ```
diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md
index 20d78ca3aae2c..da61d2a85e466 100644
--- a/docs/design/logits_processors.md
+++ b/docs/design/logits_processors.md
@@ -174,7 +174,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
     from collections.abc import Sequence
     from dataclasses import dataclass
     from enum import Enum, auto
-    from typing import TYPE_CHECKING, Optional
+    from typing import TYPE_CHECKING
 
     import torch
 
@@ -244,7 +244,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus
         @abstractmethod
         def update_state(
             self,
-            batch_update: Optional["BatchUpdate"],
+            batch_update: "BatchUpdate | None",
         ) -> None:
             """Called when there are new output tokens, prior
             to each forward pass.
@@ -274,7 +274,7 @@ A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum)
     * Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax
     * `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling
 
-* `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`:
+* `update_state(self, batch_update: "BatchUpdate | None") -> None`:
     * Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step
     * Use the `BatchUpdate` members to update logits processor internal state
     * **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added.
diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md
index 201b340c5972c..b8ad53863cd7a 100644
--- a/docs/features/custom_logitsprocs.md
+++ b/docs/features/custom_logitsprocs.md
@@ -93,7 +93,6 @@ The contrived example below implements a custom logits processor which consumes
 ??? code "Example custom logits processor definition"
 
     ``` python
-    from typing import Optional
     import torch
     from vllm.config import VllmConfig
     from vllm.sampling_params import SamplingParams
@@ -112,7 +111,7 @@ The contrived example below implements a custom logits processor which consumes
             """Never impacts greedy sampling"""
             return False
 
-        def update_state(self, batch_update: Optional[BatchUpdate]):
+        def update_state(self, batch_update: BatchUpdate | None):
             if not batch_update:
                 return
 
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py
index a36664e470450..c4eed2037781a 100644
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -10,7 +10,7 @@ on HuggingFace model repository.
 
 import os
 from dataclasses import asdict
-from typing import Any, NamedTuple, Optional
+from typing import Any, NamedTuple
 
 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer
@@ -30,11 +30,11 @@ question_per_audio_count = {
 
 class ModelRequestData(NamedTuple):
     engine_args: EngineArgs
-    prompt: Optional[str] = None
-    prompt_token_ids: Optional[dict[str, list[int]]] = None
-    multi_modal_data: Optional[dict[str, Any]] = None
-    stop_token_ids: Optional[list[int]] = None
-    lora_requests: Optional[list[LoRARequest]] = None
+    prompt: str | None = None
+    prompt_token_ids: dict[str, list[int]] | None = None
+    multi_modal_data: dict[str, Any] | None = None
+    stop_token_ids: list[int] | None = None
+    lora_requests: list[LoRARequest] | None = None
 
 
 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
diff --git a/examples/offline_inference/kv_load_failure_recovery/rogue_shared_storage_connector.py b/examples/offline_inference/kv_load_failure_recovery/rogue_shared_storage_connector.py
index 0abe7d1612610..5b2acea4c9457 100644
--- a/examples/offline_inference/kv_load_failure_recovery/rogue_shared_storage_connector.py
+++ b/examples/offline_inference/kv_load_failure_recovery/rogue_shared_storage_connector.py
@@ -3,7 +3,7 @@
 # ruff: noqa: E501
 import logging
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
@@ -81,7 +81,7 @@ class RogueSharedStorageConnector(SharedStorageConnector):
 
     def get_finished(
         self, finished_req_ids: set[str]
-    ) -> tuple[Optional[set[str]], Optional[set[str]]]:
+    ) -> tuple[set[str] | None, set[str] | None]:
         if self._async_load:
             meta = self._get_connector_metadata()
             assert isinstance(meta, RogueSharedStorageConnectorMetadata)
diff --git a/examples/offline_inference/logits_processor/custom.py b/examples/offline_inference/logits_processor/custom.py
index 4112a498f37ab..72e7ce24d7cc8 100644
--- a/examples/offline_inference/logits_processor/custom.py
+++ b/examples/offline_inference/logits_processor/custom.py
@@ -33,8 +33,6 @@ Output:    ' in the hands of the people.\n\nThe future of AI is in the'
 ------------------------------------------------------------
 """
 
-from typing import Optional
-
 import torch
 
 from vllm import LLM, SamplingParams
@@ -58,7 +56,7 @@ class DummyLogitsProcessor(LogitsProcessor):
     def is_argmax_invariant(self) -> bool:
         return False
 
-    def update_state(self, batch_update: Optional[BatchUpdate]):
+    def update_state(self, batch_update: BatchUpdate | None):
         process_dict_updates(
             self.req_info,
             batch_update,
diff --git a/examples/offline_inference/logits_processor/custom_req.py b/examples/offline_inference/logits_processor/custom_req.py
index 4c19bb4ce2bae..87cd7473fa9f1 100644
--- a/examples/offline_inference/logits_processor/custom_req.py
+++ b/examples/offline_inference/logits_processor/custom_req.py
@@ -39,7 +39,7 @@ Output:    ' in the hands of the people.\n\nThe future of AI is in the'
 ------------------------------------------------------------
 """
 
-from typing import Any, Optional
+from typing import Any
 
 import torch
 
@@ -82,7 +82,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
     def new_req_logits_processor(
         self,
         params: SamplingParams,
-    ) -> Optional[RequestLogitsProcessor]:
+    ) -> RequestLogitsProcessor | None:
         """This method returns a new request-level logits processor, customized
         to the `target_token` value associated with a particular request.
 
@@ -96,7 +96,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
         Returns:
           `Callable` request logits processor, or None
         """
-        target_token: Optional[Any] = params.extra_args and params.extra_args.get(
+        target_token: Any | None = params.extra_args and params.extra_args.get(
             "target_token"
         )
         if target_token is None:
diff --git a/examples/offline_inference/logits_processor/custom_req_init.py b/examples/offline_inference/logits_processor/custom_req_init.py
index 62947d122e01c..3bb82a786040b 100644
--- a/examples/offline_inference/logits_processor/custom_req_init.py
+++ b/examples/offline_inference/logits_processor/custom_req_init.py
@@ -41,8 +41,6 @@ which indicates that the logits processor is running. However, on a non-"cuda"
 device, the first and third requests would not repeat the same token.
 """
 
-from typing import Optional
-
 import torch
 
 from vllm import LLM, SamplingParams
@@ -91,7 +89,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
     def new_req_logits_processor(
         self,
         params: SamplingParams,
-    ) -> Optional[RequestLogitsProcessor]:
+    ) -> RequestLogitsProcessor | None:
         """This method returns a new request-level logits processor, customized
         to the `target_token` value associated with a particular request.
 
diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py
index 00d4cb9eb4c41..dc5c6202fa57b 100644
--- a/examples/offline_inference/lora_with_quantization_inference.py
+++ b/examples/offline_inference/lora_with_quantization_inference.py
@@ -8,7 +8,6 @@ Requires HuggingFace credentials for access.
 """
 
 import gc
-from typing import Optional
 
 import torch
 from huggingface_hub import snapshot_download
@@ -19,7 +18,7 @@ from vllm.lora.request import LoRARequest
 
 def create_test_prompts(
     lora_path: str,
-) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
+) -> list[tuple[str, SamplingParams, LoRARequest | None]]:
     return [
         # this is an example of using quantization without LoRA
         (
@@ -56,7 +55,7 @@ def create_test_prompts(
 
 def process_requests(
     engine: LLMEngine,
-    test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
+    test_prompts: list[tuple[str, SamplingParams, LoRARequest | None]],
 ):
     """Continuously process a list of prompts and handle the outputs."""
     request_id = 0
@@ -78,7 +77,7 @@ def process_requests(
 
 
 def initialize_engine(
-    model: str, quantization: str, lora_repo: Optional[str]
+    model: str, quantization: str, lora_repo: str | None
 ) -> LLMEngine:
     """Initialize the LLMEngine."""
 
diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py
index 6040683c68bcd..6c23cf342e06b 100644
--- a/examples/offline_inference/multilora_inference.py
+++ b/examples/offline_inference/multilora_inference.py
@@ -7,8 +7,6 @@ for offline inference.
 Requires HuggingFace credentials for access to Llama2.
 """
 
-from typing import Optional
-
 from huggingface_hub import snapshot_download
 
 from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
@@ -17,7 +15,7 @@ from vllm.lora.request import LoRARequest
 
 def create_test_prompts(
     lora_path: str,
-) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
+) -> list[tuple[str, SamplingParams, LoRARequest | None]]:
     """Create a list of test prompts with their sampling parameters.
 
     2 requests for base model, 4 requests for the LoRA. We define 2
@@ -68,7 +66,7 @@ def create_test_prompts(
 
 def process_requests(
     engine: LLMEngine,
-    test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
+    test_prompts: list[tuple[str, SamplingParams, LoRARequest | None]],
 ):
     """Continuously process a list of prompts and handle the outputs."""
     request_id = 0
diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py
index 1a5879a6d35f5..2c73ed6aa6083 100644
--- a/examples/offline_inference/prithvi_geospatial_mae.py
+++ b/examples/offline_inference/prithvi_geospatial_mae.py
@@ -3,7 +3,6 @@
 import argparse
 import datetime
 import os
-from typing import Union
 
 import albumentations
 import numpy as np
@@ -160,7 +159,7 @@ def load_example(
     file_paths: list[str],
     mean: list[float] = None,
     std: list[float] = None,
-    indices: Union[list[int], None] = None,
+    indices: list[int] | None = None,
 ):
     """Build an input example by loading images in *file_paths*.
 
diff --git a/examples/offline_inference/rlhf_utils.py b/examples/offline_inference/rlhf_utils.py
index c0e60b9793407..13def88439ef2 100644
--- a/examples/offline_inference/rlhf_utils.py
+++ b/examples/offline_inference/rlhf_utils.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
-from typing import Callable, Optional, TypedDict
+from collections.abc import Callable
+from typing import TypedDict
 
 import torch
 import zmq
@@ -71,7 +72,7 @@ class WorkerExtension:
 
 
 def rebuild_ipc(
-    handle: tuple[Callable, tuple], device_id: Optional[int] = None
+    handle: tuple[Callable, tuple], device_id: int | None = None
 ) -> torch.Tensor:
     func, args = handle
     list_args = list(args)
@@ -109,7 +110,7 @@ class ColocateWorkerExtension:
             self._zmq_ctx = zmq.Context()
         socket = self._zmq_ctx.socket(zmq.REP)
         socket.connect(zmq_handles[self.report_device_id()])
-        buffer: Optional[torch.Tensor] = None
+        buffer: torch.Tensor | None = None
         while True:
             payload: tuple[Callable, tuple] | list[FlattenedTensorMetadata] | None = (
                 socket.recv_pyobj()
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 9fd9da3b0855e..1f09dabaf74c8 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -12,7 +12,7 @@ import os
 import random
 from contextlib import contextmanager
 from dataclasses import asdict
-from typing import NamedTuple, Optional
+from typing import NamedTuple
 
 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer
@@ -28,8 +28,8 @@ from vllm.utils import FlexibleArgumentParser
 class ModelRequestData(NamedTuple):
     engine_args: EngineArgs
     prompts: list[str]
-    stop_token_ids: Optional[list[int]] = None
-    lora_requests: Optional[list[LoRARequest]] = None
+    stop_token_ids: list[int] | None = None
+    lora_requests: list[LoRARequest] | None = None
 
 
 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index c37d40a23ac20..accb6c742a2b6 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -9,7 +9,7 @@ using the chat template defined by the model.
 import os
 from argparse import Namespace
 from dataclasses import asdict
-from typing import NamedTuple, Optional
+from typing import NamedTuple
 
 from huggingface_hub import snapshot_download
 from PIL.Image import Image
@@ -41,9 +41,9 @@ class ModelRequestData(NamedTuple):
     engine_args: EngineArgs
     prompt: str
     image_data: list[Image]
-    stop_token_ids: Optional[list[int]] = None
-    chat_template: Optional[str] = None
-    lora_requests: Optional[list[LoRARequest]] = None
+    stop_token_ids: list[int] | None = None
+    chat_template: str | None = None
+    lora_requests: list[LoRARequest] | None = None
 
 
 # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
@@ -1251,7 +1251,7 @@ model_example_map = {
 }
 
 
-def run_generate(model, question: str, image_urls: list[str], seed: Optional[int]):
+def run_generate(model, question: str, image_urls: list[str], seed: int | None):
     req_data = model_example_map[model](question, image_urls)
 
     engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
@@ -1277,7 +1277,7 @@ def run_generate(model, question: str, image_urls: list[str], seed: Optional[int
         print("-" * 50)
 
 
-def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[int]):
+def run_chat(model: str, question: str, image_urls: list[str], seed: int | None):
     req_data = model_example_map[model](question, image_urls)
 
     # Disable other modalities to save memory
diff --git a/examples/offline_inference/vision_language_pooling.py b/examples/offline_inference/vision_language_pooling.py
index 33ffb59014d8f..1ce2cdc436d6a 100644
--- a/examples/offline_inference/vision_language_pooling.py
+++ b/examples/offline_inference/vision_language_pooling.py
@@ -11,7 +11,7 @@ on HuggingFace model repository.
 from argparse import Namespace
 from dataclasses import asdict
 from pathlib import Path
-from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
+from typing import Literal, NamedTuple, TypeAlias, TypedDict, get_args
 
 from PIL.Image import Image
 
@@ -47,15 +47,15 @@ class TextImagesQuery(TypedDict):
 
 
 QueryModality = Literal["text", "image", "text+image", "text+images"]
-Query = Union[TextQuery, ImageQuery, TextImageQuery, TextImagesQuery]
+Query: TypeAlias = TextQuery | ImageQuery | TextImageQuery | TextImagesQuery
 
 
 class ModelRequestData(NamedTuple):
     engine_args: EngineArgs
-    prompt: Optional[str] = None
-    image: Optional[Image] = None
-    query: Optional[str] = None
-    documents: Optional[ScoreMultiModalParam] = None
+    prompt: str | None = None
+    image: Image | None = None
+    query: str | None = None
+    documents: ScoreMultiModalParam | None = None
 
 
 def run_clip(query: Query) -> ModelRequestData:
@@ -281,7 +281,7 @@ def get_query(modality: QueryModality):
     raise ValueError(msg)
 
 
-def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
+def run_encode(model: str, modality: QueryModality, seed: int | None):
     query = get_query(modality)
     req_data = model_example_map[model](query)
 
@@ -311,7 +311,7 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
         print("-" * 50)
 
 
-def run_score(model: str, modality: QueryModality, seed: Optional[int]):
+def run_score(model: str, modality: QueryModality, seed: int | None):
     query = get_query(modality)
     req_data = model_example_map[model](query)
 
diff --git a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
index 1df11d9d84957..2b8482ec717af 100644
--- a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
+++ b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
@@ -23,7 +23,7 @@ import logging
 import os
 import sys
 from abc import ABC, abstractmethod
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import aiohttp
 import requests
@@ -49,12 +49,9 @@ class Proxy:
         decode_instances: list[str],
         model: str,
         scheduling_policy: SchedulingPolicy,
-        custom_create_completion: Optional[
-            Callable[[Request], StreamingResponse]
-        ] = None,
-        custom_create_chat_completion: Optional[
-            Callable[[Request], StreamingResponse]
-        ] = None,
+        custom_create_completion: Callable[[Request], StreamingResponse] | None = None,
+        custom_create_chat_completion: Callable[[Request], StreamingResponse]
+        | None = None,
     ):
         self.prefill_instances = prefill_instances
         self.decode_instances = decode_instances
@@ -348,9 +345,9 @@ class ProxyServer:
     def __init__(
         self,
         args: argparse.Namespace,
-        scheduling_policy: Optional[SchedulingPolicy] = None,
-        create_completion: Optional[Callable[[Request], StreamingResponse]] = None,
-        create_chat_completion: Optional[Callable[[Request], StreamingResponse]] = None,
+        scheduling_policy: SchedulingPolicy | None = None,
+        create_completion: Callable[[Request], StreamingResponse] | None = None,
+        create_chat_completion: Callable[[Request], StreamingResponse] | None = None,
     ):
         self.validate_parsed_serve_args(args)
         self.port = args.port
diff --git a/examples/online_serving/kv_events_subscriber.py b/examples/online_serving/kv_events_subscriber.py
index f4b79b5e13020..19f6bd5726102 100644
--- a/examples/online_serving/kv_events_subscriber.py
+++ b/examples/online_serving/kv_events_subscriber.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any, Optional, Union
+from typing import Any
 
 import msgspec
 import zmq
@@ -25,16 +25,16 @@ class KVCacheEvent(
 
 class BlockStored(KVCacheEvent):
     block_hashes: list[ExternalBlockHash]
-    parent_block_hash: Optional[ExternalBlockHash]
+    parent_block_hash: ExternalBlockHash | None
     token_ids: list[int]
     block_size: int
-    lora_id: Optional[int]
-    medium: Optional[str]
+    lora_id: int | None
+    medium: str | None
 
 
 class BlockRemoved(KVCacheEvent):
     block_hashes: list[ExternalBlockHash]
-    medium: Optional[str]
+    medium: str | None
 
 
 class AllBlocksCleared(KVCacheEvent):
@@ -42,7 +42,7 @@ class AllBlocksCleared(KVCacheEvent):
 
 
 class KVEventBatch(EventBatch):
-    events: list[Union[BlockStored, BlockRemoved, AllBlocksCleared]]
+    events: list[BlockStored | BlockRemoved | AllBlocksCleared]
 
 
 def process_event(event_batch):
diff --git a/examples/online_serving/multi_instance_data_parallel.py b/examples/online_serving/multi_instance_data_parallel.py
index cb230913a422f..b46cea5619671 100644
--- a/examples/online_serving/multi_instance_data_parallel.py
+++ b/examples/online_serving/multi_instance_data_parallel.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
-from typing import Optional
 
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -43,7 +42,7 @@ async def main():
     )
 
     prompt = "Who won the 2004 World Series?"
-    final_output: Optional[RequestOutput] = None
+    final_output: RequestOutput | None = None
     async for output in engine_client.generate(
         prompt=prompt,
         sampling_params=sampling_params,
diff --git a/examples/online_serving/pooling/cohere_rerank_client.py b/examples/online_serving/pooling/cohere_rerank_client.py
index 63c9ff9e93980..b32209967be9a 100644
--- a/examples/online_serving/pooling/cohere_rerank_client.py
+++ b/examples/online_serving/pooling/cohere_rerank_client.py
@@ -8,8 +8,6 @@ Note that `pip install cohere` is needed to run this example.
 run: vllm serve BAAI/bge-reranker-base
 """
 
-from typing import Union
-
 import cohere
 from cohere import Client, ClientV2
 
@@ -25,7 +23,7 @@ documents = [
 
 
 def cohere_rerank(
-    client: Union[Client, ClientV2], model: str, query: str, documents: list[str]
+    client: Client | ClientV2, model: str, query: str, documents: list[str]
 ) -> dict:
     return client.rerank(model=model, query=query, documents=documents)
 
diff --git a/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
index 16ac4378c6863..25ab865a4ee43 100644
--- a/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
+++ b/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py
@@ -9,7 +9,7 @@ Refer to each `run_*` function for the command to run the server for that model.
 import argparse
 import base64
 import io
-from typing import Literal, Union
+from typing import Literal
 
 from openai import OpenAI
 from openai._types import NOT_GIVEN, NotGiven
@@ -29,7 +29,7 @@ def create_chat_embeddings(
     *,
     messages: list[ChatCompletionMessageParam],
     model: str,
-    encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN,
+    encoding_format: Literal["base64", "float"] | NotGiven = NOT_GIVEN,
 ) -> CreateEmbeddingResponse:
     """
     Convenience function for accessing vLLM's Chat Embeddings API,
diff --git a/examples/online_serving/structured_outputs/structured_outputs.py b/examples/online_serving/structured_outputs/structured_outputs.py
index 3ea6c73e90e8f..02853a95469a6 100644
--- a/examples/online_serving/structured_outputs/structured_outputs.py
+++ b/examples/online_serving/structured_outputs/structured_outputs.py
@@ -1,21 +1,15 @@
 # ruff: noqa: E501
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from __future__ import annotations
-
 import argparse
 import asyncio
 import enum
 import os
-from typing import TYPE_CHECKING, Any, Literal
+from typing import Any, Literal
 
 import openai
 import pydantic
-
-if TYPE_CHECKING:
-    from openai.types.chat import ChatCompletionChunk
-
+from openai.types.chat import ChatCompletionChunk
 
 ConstraintsFormat = Literal[
     "choice",
diff --git a/pyproject.toml b/pyproject.toml
index 49a7a0b8b1210..eb9bdb593baac 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -84,12 +84,6 @@ ignore = [
     "B007",
     # f-string format
     "UP032",
-    # Can remove once 3.10+ is the minimum Python version
-    "UP007",
-    "UP027",
-    "UP035",
-    "UP038",
-    "UP045",
 ]
 
 [tool.ruff.format]
diff --git a/tests/benchmarks/test_random_dataset.py b/tests/benchmarks/test_random_dataset.py
index 90527dbeae28c..68e4afdcbe521 100644
--- a/tests/benchmarks/test_random_dataset.py
+++ b/tests/benchmarks/test_random_dataset.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import random
-from typing import Any, NamedTuple, Optional, cast
+from typing import Any, NamedTuple, cast
 
 import numpy as np
 import pytest
@@ -185,8 +185,8 @@ def _collect_mm_samples(
     output_len: int = 5,
     base_items_per_request: int = 2,
     num_mm_items_range_ratio: float = 0.0,
-    limit_mm_per_prompt: Optional[dict[str, int]] = None,
-    bucket_config: Optional[dict[tuple[int, int, int], float]] = None,
+    limit_mm_per_prompt: dict[str, int] | None = None,
+    bucket_config: dict[tuple[int, int, int], float] | None = None,
     enable_multimodal_chat: bool = False,
 ) -> list[SampleRequest]:
     if limit_mm_per_prompt is None:
diff --git a/tests/ci_envs.py b/tests/ci_envs.py
index d16ecce1ef8dd..596a05b9e5f33 100644
--- a/tests/ci_envs.py
+++ b/tests/ci_envs.py
@@ -5,13 +5,14 @@ These envs only work for a small part of the tests, fix what you need!
 """
 
 import os
-from typing import TYPE_CHECKING, Any, Callable, Optional
+from collections.abc import Callable
+from typing import TYPE_CHECKING, Any
 
 if TYPE_CHECKING:
     VLLM_CI_NO_SKIP: bool = False
-    VLLM_CI_DTYPE: Optional[str] = None
-    VLLM_CI_HEAD_DTYPE: Optional[str] = None
-    VLLM_CI_HF_DTYPE: Optional[str] = None
+    VLLM_CI_DTYPE: str | None = None
+    VLLM_CI_HEAD_DTYPE: str | None = None
+    VLLM_CI_HF_DTYPE: str | None = None
 
 environment_variables: dict[str, Callable[[], Any]] = {
     # A model family has many models with the same architecture.
diff --git a/tests/compile/backend.py b/tests/compile/backend.py
index 36bc832a1329e..ef1fdd4f9daef 100644
--- a/tests/compile/backend.py
+++ b/tests/compile/backend.py
@@ -2,9 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import weakref
-from collections.abc import Sequence
+from collections.abc import Callable, Sequence
 from copy import deepcopy
-from typing import Callable, Union
 
 from torch import fx
 from torch._ops import OpOverload
@@ -44,7 +43,7 @@ class TestBackend:
     Inductor config is default-initialized from VllmConfig.CompilationConfig.
     """
 
-    def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph], None]]):
+    def __init__(self, *passes: InductorPass | Callable[[fx.Graph], None]):
         self.custom_passes = list(passes)
         compile_config = get_current_vllm_config().compilation_config
         self.inductor_config = compile_config.inductor_compile_config
diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py
index 08f59283a6db5..45317b456af48 100644
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@@ -10,7 +10,7 @@ initialized randomly with a fixed seed.
 """
 
 from dataclasses import dataclass
-from typing import Any, Optional
+from typing import Any
 
 import pytest
 import torch
@@ -162,7 +162,7 @@ class LlamaDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         For tractable computation:
@@ -217,7 +217,7 @@ class LlamaModel(nn.Module):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
     ) -> torch.Tensor:
         hidden_states = self.embedding_tokens(input_ids)
diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py
index 4bcefb30b2e6e..9bfd72260436b 100644
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import dataclasses
 
 import pytest
diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py
index 8ccae4cfb9df2..2f3794c90b204 100644
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -1,11 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import logging
 import tempfile
-from typing import Any, Union
+from typing import Any
 
 import pytest
 import torch
@@ -217,7 +215,7 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm):
 
 
 def run_model(
-    compile_config: Union[int, CompilationConfig],
+    compile_config: int | CompilationConfig,
     model: str,
     model_kwargs: dict[str, Any],
 ):
diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py
index 0f2e3bffbd311..d1ab85cfb875c 100644
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
-from typing import Optional
 
 import pytest
 import torch._dynamo
@@ -41,8 +40,8 @@ FP8_DTYPE = current_platform.fp8_dtype()
 FP4_DTYPE = torch.uint8
 
 # globals needed for string-import custom Dynamo backend field
-backend: Optional[TestBackend] = None
-backend_unfused: Optional[TestBackend] = None
+backend: TestBackend | None = None
+backend_unfused: TestBackend | None = None
 
 
 class AttentionQuantPatternModel(torch.nn.Module):
diff --git a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py
index 34db5a999cbd8..b2fff822bbbb5 100644
--- a/tests/compile/test_wrapper.py
+++ b/tests/compile/test_wrapper.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -10,7 +9,7 @@ from vllm.config import CompilationLevel
 
 
 class MyMod(torch.nn.Module):
-    def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
+    def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
         if cache is not None:
             return x + cache
         return x * 2
@@ -24,11 +23,11 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher):
             compiled_callable, compilation_level=CompilationLevel.DYNAMO_ONCE
         )
 
-    def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
+    def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None):
         # this is the function to be compiled
         return self.model(x, cache)
 
-    def __call__(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None):
+    def __call__(self, x: torch.Tensor, cache: torch.Tensor | None = None):
         # let torch.compile compile twice
         if len(self.compiled_codes) == 2:
             dispatch_id = 0 if cache is None else 1
diff --git a/tests/conftest.py b/tests/conftest.py
index 4713e12385965..2fde7f97836d6 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -21,7 +21,7 @@ import threading
 from collections.abc import Generator
 from contextlib import nullcontext
 from enum import Enum
-from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast
+from typing import Any, Callable, TypedDict, TypeVar, cast
 
 import numpy as np
 import pytest
@@ -68,7 +68,7 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
 
 _M = TypeVar("_M")
 
-_PromptMultiModalInput = Union[list[_M], list[list[_M]]]
+_PromptMultiModalInput = list[_M] | list[list[_M]]
 
 PromptImageInput = _PromptMultiModalInput[Image.Image]
 PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]]
@@ -267,7 +267,7 @@ class HfRunner:
 
         return "cpu" if current_platform.is_cpu() else current_platform.device_type
 
-    def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
+    def wrap_device(self, x: _T, device: str | None = None) -> _T:
         if x is None or isinstance(x, (bool,)):
             return x
 
@@ -287,14 +287,14 @@ class HfRunner:
         model_name: str,
         dtype: str = "auto",
         *,
-        model_kwargs: Optional[dict[str, Any]] = None,
+        model_kwargs: dict[str, Any] | None = None,
         trust_remote_code: bool = True,
         is_sentence_transformer: bool = False,
         is_cross_encoder: bool = False,
         skip_tokenizer_init: bool = False,
         auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM,
         # Set this to avoid hanging issue
-        default_torch_num_threads: Optional[int] = None,
+        default_torch_num_threads: int | None = None,
     ) -> None:
         init_ctx = (
             nullcontext()
@@ -319,7 +319,7 @@ class HfRunner:
         model_name: str,
         dtype: str = "auto",
         *,
-        model_kwargs: Optional[dict[str, Any]] = None,
+        model_kwargs: dict[str, Any] | None = None,
         trust_remote_code: bool = True,
         is_sentence_transformer: bool = False,
         is_cross_encoder: bool = False,
@@ -406,11 +406,11 @@ class HfRunner:
 
     def get_inputs(
         self,
-        prompts: Union[list[str], list[list[int]]],
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
-    ) -> list[Union[BatchFeature, BatchEncoding, dict[str, torch.Tensor]]]:
+        prompts: list[str] | list[list[int]],
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
+    ) -> list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]]:
         if images is not None:
             assert len(prompts) == len(images)
 
@@ -420,9 +420,7 @@ class HfRunner:
         if audios is not None:
             assert len(prompts) == len(audios)
 
-        all_inputs: list[
-            Union[BatchFeature, BatchEncoding, dict[str, torch.Tensor]]
-        ] = []
+        all_inputs: list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]] = []
         for i, prompt in enumerate(prompts):
             if isinstance(prompt, str):
                 processor_kwargs: dict[str, Any] = {
@@ -494,10 +492,10 @@ class HfRunner:
 
     def generate(
         self,
-        prompts: Union[list[str], list[list[int]]],
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
+        prompts: list[str] | list[list[int]],
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
         **kwargs: Any,
     ) -> list[tuple[list[list[int]], list[str]]]:
         all_inputs = self.get_inputs(
@@ -522,11 +520,11 @@ class HfRunner:
 
     def generate_greedy(
         self,
-        prompts: Union[list[str], list[list[int]]],
+        prompts: list[str] | list[list[int]],
         max_tokens: int,
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
         **kwargs: Any,
     ) -> list[tuple[list[int], str]]:
         outputs = self.generate(
@@ -546,9 +544,9 @@ class HfRunner:
         prompts: list[str],
         beam_width: int,
         max_tokens: int,
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
     ) -> list[tuple[list[list[int]], list[str]]]:
         outputs = self.generate(
             prompts,
@@ -574,9 +572,9 @@ class HfRunner:
         self,
         prompts: list[str],
         max_tokens: int,
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
         **kwargs: Any,
     ) -> list[list[torch.Tensor]]:
         all_inputs = self.get_inputs(
@@ -624,7 +622,7 @@ class HfRunner:
     def _hidden_states_to_logprobs(
         self,
         hidden_states: tuple[tuple[torch.Tensor, ...], ...],
-        num_logprobs: Optional[int],
+        num_logprobs: int | None,
     ) -> tuple[list[dict[int, float]], int]:
         seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states)
         output_len = len(hidden_states)
@@ -652,10 +650,10 @@ class HfRunner:
         self,
         prompts: list[str],
         max_tokens: int,
-        num_logprobs: Optional[int],
-        images: Optional[PromptImageInput] = None,
-        audios: Optional[PromptAudioInput] = None,
-        videos: Optional[PromptVideoInput] = None,
+        num_logprobs: int | None,
+        images: PromptImageInput | None = None,
+        audios: PromptAudioInput | None = None,
+        videos: PromptVideoInput | None = None,
         **kwargs: Any,
     ) -> list[TokensTextLogprobs]:
         all_inputs = self.get_inputs(
@@ -734,20 +732,20 @@ class VllmRunner:
         model_name: str,
         runner: RunnerOption = "auto",
         convert: ConvertOption = "auto",
-        tokenizer_name: Optional[str] = None,
+        tokenizer_name: str | None = None,
         tokenizer_mode: str = "auto",
         trust_remote_code: bool = True,
-        seed: Optional[int] = 0,
-        max_model_len: Optional[int] = 1024,
+        seed: int | None = 0,
+        max_model_len: int | None = 1024,
         dtype: str = "auto",
         disable_log_stats: bool = True,
         tensor_parallel_size: int = 1,
         block_size: int = 16 if not torch.xpu.is_available() else 64,
-        enable_chunked_prefill: Optional[bool] = False,
+        enable_chunked_prefill: bool | None = False,
         swap_space: int = 4,
-        enforce_eager: Optional[bool] = False,
+        enforce_eager: bool | None = False,
         # Set this to avoid hanging issue
-        default_torch_num_threads: Optional[int] = None,
+        default_torch_num_threads: int | None = None,
         **kwargs,
     ) -> None:
         init_ctx = (
@@ -785,10 +783,10 @@ class VllmRunner:
 
     def get_inputs(
         self,
-        prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
+        prompts: list[str] | list[torch.Tensor] | list[list[int]],
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
     ) -> list[dict[str, Any]]:
         if any(
             x is not None and len(x) != len(prompts) for x in [images, videos, audios]
@@ -824,11 +822,11 @@ class VllmRunner:
 
     def generate(
         self,
-        prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
+        prompts: list[str] | list[torch.Tensor] | list[list[int]],
         sampling_params: SamplingParams,
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
         **kwargs: Any,
     ) -> list[tuple[list[list[int]], list[str]]]:
         inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
@@ -871,11 +869,11 @@ class VllmRunner:
         self,
         prompts: list[str],
         sampling_params: SamplingParams,
-        images: Optional[PromptImageInput] = None,
-        audios: Optional[PromptAudioInput] = None,
-        videos: Optional[PromptVideoInput] = None,
+        images: PromptImageInput | None = None,
+        audios: PromptAudioInput | None = None,
+        videos: PromptVideoInput | None = None,
         **kwargs: Any,
-    ) -> Union[list[TokensTextLogprobs], list[TokensTextLogprobsPromptLogprobs]]:
+    ) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
         inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
 
         req_outputs = self.llm.generate(
@@ -894,11 +892,11 @@ class VllmRunner:
 
     def generate_greedy(
         self,
-        prompts: Union[list[str], list[torch.Tensor], list[list[int]]],
+        prompts: list[str] | list[torch.Tensor] | list[list[int]],
         max_tokens: int,
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
         **kwargs: Any,
     ) -> list[tuple[list[int], str]]:
         greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
@@ -916,15 +914,15 @@ class VllmRunner:
         self,
         prompts: list[str],
         max_tokens: int,
-        num_logprobs: Optional[int],
-        num_prompt_logprobs: Optional[int] = None,
-        images: Optional[PromptImageInput] = None,
-        audios: Optional[PromptAudioInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        stop_token_ids: Optional[list[int]] = None,
-        stop: Optional[list[str]] = None,
+        num_logprobs: int | None,
+        num_prompt_logprobs: int | None = None,
+        images: PromptImageInput | None = None,
+        audios: PromptAudioInput | None = None,
+        videos: PromptVideoInput | None = None,
+        stop_token_ids: list[int] | None = None,
+        stop: list[str] | None = None,
         **kwargs: Any,
-    ) -> Union[list[TokensTextLogprobs], list[TokensTextLogprobsPromptLogprobs]]:
+    ) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]:
         greedy_logprobs_params = SamplingParams(
             temperature=0.0,
             max_tokens=max_tokens,
@@ -957,7 +955,7 @@ class VllmRunner:
         perplexities = []
         for output in outputs:
             output = cast(TokensTextLogprobsPromptLogprobs, output)
-            token_datas = cast(list[Optional[dict[int, Logprob]]], output[3])
+            token_datas = cast(list[dict[int, Logprob] | None], output[3])
             assert token_datas[0] is None
             token_log_probs = []
             for token_data in token_datas[1:]:
@@ -976,10 +974,10 @@ class VllmRunner:
         prompts: list[str],
         beam_width: int,
         max_tokens: int,
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
-        concurrency_limit: Optional[int] = None,
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
+        concurrency_limit: int | None = None,
     ) -> list[tuple[list[list[int]], list[str]]]:
         inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios)
 
@@ -1002,9 +1000,9 @@ class VllmRunner:
     def embed(
         self,
         prompts: list[str],
-        images: Optional[PromptImageInput] = None,
-        videos: Optional[PromptVideoInput] = None,
-        audios: Optional[PromptAudioInput] = None,
+        images: PromptImageInput | None = None,
+        videos: PromptVideoInput | None = None,
+        audios: PromptAudioInput | None = None,
         *args,
         **kwargs,
     ) -> list[list[float]]:
@@ -1023,8 +1021,8 @@ class VllmRunner:
 
     def score(
         self,
-        text_1: Union[str, list[str]],
-        text_2: Union[str, list[str]],
+        text_1: list[str] | str,
+        text_2: list[str] | str,
         *args,
         **kwargs,
     ) -> list[float]:
@@ -1226,8 +1224,8 @@ def _find_free_port() -> int:
 class LocalAssetServer:
     address: str
     port: int
-    server: Optional[http.server.ThreadingHTTPServer]
-    thread: Optional[threading.Thread]
+    server: http.server.ThreadingHTTPServer | None
+    thread: threading.Thread | None
 
     def __init__(self, address: str = "127.0.0.1") -> None:
         self.address = address
diff --git a/tests/detokenizer/test_stop_strings.py b/tests/detokenizer/test_stop_strings.py
index d59b394393e34..6b829c2610359 100644
--- a/tests/detokenizer/test_stop_strings.py
+++ b/tests/detokenizer/test_stop_strings.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Optional
+from typing import Any
 
 import pytest
 
@@ -15,8 +15,8 @@ def _test_stopping(
     llm: LLM,
     expected_output: str,
     expected_reason: Any,
-    stop: Optional[list[str]] = None,
-    stop_token_ids: Optional[list[int]] = None,
+    stop: list[str] | None = None,
+    stop_token_ids: list[int] | None = None,
     include_in_output: bool = False,
 ) -> None:
     output = llm.generate(
diff --git a/tests/distributed/conftest.py b/tests/distributed/conftest.py
index 47ceb45057c97..9c146a3323d90 100644
--- a/tests/distributed/conftest.py
+++ b/tests/distributed/conftest.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import random
-from typing import Optional, Union
 
 import msgspec
 import msgspec.msgpack
@@ -78,8 +77,8 @@ class MockSubscriber:
 
     def __init__(
         self,
-        pub_endpoints: Union[str, list[str]],
-        replay_endpoints: Optional[Union[str, list[str]]] = None,
+        pub_endpoints: str | list[str],
+        replay_endpoints: str | list[str] | None = None,
         topic: str = "",
         decode_type=SampleBatch,
     ):
@@ -111,7 +110,7 @@ class MockSubscriber:
         self.last_seq = -1
         self.decoder = msgspec.msgpack.Decoder(type=decode_type)
 
-    def receive_one(self, timeout=1000) -> Union[tuple[int, SampleBatch], None]:
+    def receive_one(self, timeout=1000) -> tuple[int, SampleBatch] | None:
         """Receive a single message with timeout"""
         if not self.sub.poll(timeout):
             return None
diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py
index c61c4584d8376..ba80ee6fb83ba 100644
--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -5,9 +5,8 @@
 Run `pytest tests/distributed/test_comm_ops.py`.
 """
 
-from __future__ import annotations
-
-from typing import Any, Callable
+from collections.abc import Callable
+from typing import Any
 
 import pytest
 import ray
diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py
index 89c2c9f8badeb..149b502a85a75 100644
--- a/tests/distributed/test_context_parallel.py
+++ b/tests/distributed/test_context_parallel.py
@@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
 import json
 import os
 from dataclasses import dataclass
-from typing import Literal, NamedTuple, Optional
+from typing import Literal, NamedTuple
 
 import pytest
 
@@ -36,7 +36,7 @@ class ParallelSetup(NamedTuple):
 
 class CPTestOptions(NamedTuple):
     multi_node_only: bool
-    load_format: Optional[str] = None
+    load_format: str | None = None
 
 
 @dataclass
@@ -54,7 +54,7 @@ class CPTestSettings:
         dcp_base: int = 1,
         multi_node_only: bool = False,
         runner: RunnerOption = "auto",
-        load_format: Optional[str] = None,
+        load_format: str | None = None,
     ):
         parallel_setups = []
         for eager_mode_val in [False]:
diff --git a/tests/distributed/test_expert_parallel.py b/tests/distributed/test_expert_parallel.py
index 8a9ddcd58cfce..0228d42a76a0f 100644
--- a/tests/distributed/test_expert_parallel.py
+++ b/tests/distributed/test_expert_parallel.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import Literal, NamedTuple, Optional
+from typing import Literal, NamedTuple
 
 import pytest
 
@@ -22,9 +22,9 @@ class ParallelSetup(NamedTuple):
 
 class EPTestOptions(NamedTuple):
     trust_remote_code: bool
-    tokenizer_mode: Optional[str]
-    load_format: Optional[str] = None
-    hf_overrides: Optional[str] = None
+    tokenizer_mode: str | None
+    load_format: str | None = None
+    hf_overrides: str | None = None
 
 
 @dataclass
@@ -40,9 +40,9 @@ class EPTestSettings:
         tp_base: int = 2,
         runner: RunnerOption = "auto",
         trust_remote_code: bool = False,
-        tokenizer_mode: Optional[str] = None,
-        load_format: Optional[str] = None,
-        hf_overrides: Optional[str] = None,
+        tokenizer_mode: str | None = None,
+        load_format: str | None = None,
+        hf_overrides: str | None = None,
     ):
         return EPTestSettings(
             parallel_setups=[
@@ -72,9 +72,9 @@ class EPTestSettings:
         tp_base: int = 2,
         runner: RunnerOption = "auto",
         trust_remote_code: bool = False,
-        tokenizer_mode: Optional[str] = None,
-        load_format: Optional[str] = None,
-        hf_overrides: Optional[str] = None,
+        tokenizer_mode: str | None = None,
+        load_format: str | None = None,
+        hf_overrides: str | None = None,
     ):
         return EPTestSettings(
             parallel_setups=[
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
index 43f0c9dd1a85a..24f62cff299a0 100644
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
 import json
 import os
 from dataclasses import dataclass
-from typing import Literal, NamedTuple, Optional
+from typing import Literal, NamedTuple
 
 import pytest
 
@@ -35,7 +35,7 @@ class ParallelSetup(NamedTuple):
 
 class PPTestOptions(NamedTuple):
     multi_node_only: bool
-    load_format: Optional[str] = None
+    load_format: str | None = None
 
 
 @dataclass
@@ -52,7 +52,7 @@ class PPTestSettings:
         pp_base: int = 2,
         multi_node_only: bool = False,
         runner: RunnerOption = "auto",
-        load_format: Optional[str] = None,
+        load_format: str | None = None,
     ):
         return PPTestSettings(
             parallel_setups=[
@@ -76,7 +76,7 @@ class PPTestSettings:
         pp_base: int = 2,
         runner: RunnerOption = "auto",
         multi_node_only: bool = False,
-        load_format: Optional[str] = None,
+        load_format: str | None = None,
     ):
         return PPTestSettings(
             parallel_setups=[
diff --git a/tests/distributed/test_pp_cudagraph.py b/tests/distributed/test_pp_cudagraph.py
index 2c9f474640088..2f2b43cb4cc2b 100644
--- a/tests/distributed/test_pp_cudagraph.py
+++ b/tests/distributed/test_pp_cudagraph.py
@@ -1,16 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
 import pytest
+from typing_extensions import LiteralString
 
 from ..utils import compare_two_settings, create_new_process_for_each_test
 
-if TYPE_CHECKING:
-    from typing_extensions import LiteralString
-
 
 @pytest.mark.parametrize(
     "PP_SIZE, MODEL_NAME",
diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py
index 0847687cf2f9a..a431bf30fc890 100644
--- a/tests/distributed/test_sequence_parallel.py
+++ b/tests/distributed/test_sequence_parallel.py
@@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
 import json
 import os
 from dataclasses import dataclass
-from typing import Literal, NamedTuple, Optional
+from typing import Literal, NamedTuple
 
 import pytest
 
@@ -36,7 +36,7 @@ class ParallelSetup(NamedTuple):
 
 class SPTestOptions(NamedTuple):
     multi_node_only: bool
-    load_format: Optional[str] = None
+    load_format: str | None = None
 
 
 @dataclass
@@ -53,7 +53,7 @@ class SPTestSettings:
         pp_base: int = 1,
         multi_node_only: bool = False,
         runner: RunnerOption = "auto",
-        load_format: Optional[str] = None,
+        load_format: str | None = None,
     ):
         parallel_setups = []
         for eager_mode_val in [False, True]:
@@ -84,7 +84,7 @@ class SPTestSettings:
         pp_base: int = 1,
         runner: RunnerOption = "auto",
         multi_node_only: bool = False,
-        load_format: Optional[str] = None,
+        load_format: str | None = None,
     ):
         parallel_setups = []
         for eager_mode_val in [False, True]:
@@ -115,7 +115,7 @@ class SPTestSettings:
         pp_base: int = 1,
         runner: RunnerOption = "auto",
         multi_node_only: bool = False,
-        load_format: Optional[str] = None,
+        load_format: str | None = None,
     ):
         parallel_setups = []
         for fusion_val in [False, True]:
diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py
index 9d367349fc2e5..78928a53942f9 100644
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -5,7 +5,7 @@ import json
 from argparse import ArgumentError
 from contextlib import nullcontext
 from dataclasses import dataclass, field
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal
 
 import pytest
 
@@ -115,9 +115,9 @@ class NestedConfig:
 class DummyConfig:
     regular_bool: bool = True
     """Regular bool with default True"""
-    optional_bool: Optional[bool] = None
+    optional_bool: bool | None = None
     """Optional bool with default None"""
-    optional_literal: Optional[Literal["x", "y"]] = None
+    optional_literal: Literal["x", "y"] | None = None
     """Optional literal with default None"""
     tuple_n: tuple[int, ...] = field(default_factory=lambda: (1, 2, 3))
     """Tuple with variable length"""
@@ -127,7 +127,7 @@ class DummyConfig:
     """List with variable length"""
     list_literal: list[Literal[1, 2]] = field(default_factory=list)
     """List with literal choices"""
-    list_union: list[Union[str, type[object]]] = field(default_factory=list)
+    list_union: list[str | type[object]] = field(default_factory=list)
     """List with union type"""
     literal_literal: Literal[Literal[1], Literal[2]] = 1
     """Literal of literals with default 1"""
@@ -152,11 +152,11 @@ def test_is_not_builtin(type_hint, expected):
     ("type_hint", "expected"),
     [
         (Annotated[int, "annotation"], {int}),
-        (Optional[int], {int, type(None)}),
-        (Annotated[Optional[int], "annotation"], {int, type(None)}),
-        (Optional[Annotated[int, "annotation"]], {int, type(None)}),
+        (int | None, {int, type(None)}),
+        (Annotated[int | None, "annotation"], {int, type(None)}),
+        (Annotated[int, "annotation"] | None, {int, type(None)}),
     ],
-    ids=["Annotated", "Optional", "Annotated_Optional", "Optional_Annotated"],
+    ids=["Annotated", "or_None", "Annotated_or_None", "or_None_Annotated"],
 )
 def test_get_type_hints(type_hint, expected):
     assert get_type_hints(type_hint) == expected
diff --git a/tests/entrypoints/openai/test_async_tokenization.py b/tests/entrypoints/openai/test_async_tokenization.py
index 5df859df42da7..682420a83a442 100644
--- a/tests/entrypoints/openai/test_async_tokenization.py
+++ b/tests/entrypoints/openai/test_async_tokenization.py
@@ -3,7 +3,7 @@
 
 import asyncio
 import random
-from typing import Callable
+from collections.abc import Callable
 
 import openai
 import pytest
diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index 14181c6b8b16b..fa8ae55d14a23 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -3,7 +3,6 @@
 
 # imports for structured outputs tests
 import json
-from typing import Optional
 
 import jsonschema
 import openai  # use the official client for correctness check
@@ -176,7 +175,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, model_name: st
     [(MODEL_NAME, 1), (MODEL_NAME, 0), (MODEL_NAME, -1), (MODEL_NAME, None)],
 )
 async def test_prompt_logprobs_chat(
-    client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: Optional[int]
+    client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: int | None
 ):
     params: dict = {
         "messages": [
diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py
index e64f68cad7c83..44d4176655375 100644
--- a/tests/entrypoints/openai/test_completion_with_function_calling.py
+++ b/tests/entrypoints/openai/test_completion_with_function_calling.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import datetime
-from typing import Union
 
 import openai  # use the official client for correctness check
 import pytest
@@ -166,7 +165,7 @@ async def test_function_tool_use(
     client: openai.AsyncOpenAI,
     model_name: str,
     stream: bool,
-    tool_choice: Union[str, dict],
+    tool_choice: str | dict,
     enable_thinking: bool,
 ):
     if not stream:
diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py
index aa4ee603647e4..a85418d5b5f4e 100644
--- a/tests/entrypoints/openai/test_lora_resolvers.py
+++ b/tests/entrypoints/openai/test_lora_resolvers.py
@@ -4,7 +4,6 @@
 from contextlib import suppress
 from dataclasses import dataclass, field
 from http import HTTPStatus
-from typing import Optional
 from unittest.mock import AsyncMock, MagicMock
 
 import pytest
@@ -38,13 +37,13 @@ class MockModelConfig:
     trust_remote_code: bool = False
     tokenizer_mode: str = "auto"
     max_model_len: int = 100
-    tokenizer_revision: Optional[str] = None
+    tokenizer_revision: str | None = None
     multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig)
     hf_config: MockHFConfig = field(default_factory=MockHFConfig)
-    logits_processor_pattern: Optional[str] = None
-    diff_sampling_param: Optional[dict] = None
+    logits_processor_pattern: str | None = None
+    diff_sampling_param: dict | None = None
     allowed_local_media_path: str = ""
-    allowed_media_domains: Optional[list[str]] = None
+    allowed_media_domains: list[str] | None = None
     encoder_config = None
     generation_config: str = "auto"
     skip_tokenizer_init: bool = False
@@ -56,7 +55,7 @@ class MockModelConfig:
 class MockLoRAResolver(LoRAResolver):
     async def resolve_lora(
         self, base_model_name: str, lora_name: str
-    ) -> Optional[LoRARequest]:
+    ) -> LoRARequest | None:
         if lora_name == "test-lora":
             return LoRARequest(
                 lora_name="test-lora",
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 10224dee0efe8..d1367b4eeaf62 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -1,16 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from __future__ import annotations
-
 import asyncio
 from contextlib import suppress
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any
+from typing import Any
 from unittest.mock import AsyncMock, MagicMock
 
 import pytest
 import pytest_asyncio
+from openai import OpenAI
 
 from vllm.config.multimodal import MultiModalConfig
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
@@ -21,9 +19,6 @@ from vllm.v1.engine.async_llm import AsyncLLM
 
 from ...utils import RemoteOpenAIServer
 
-if TYPE_CHECKING:
-    from openai import OpenAI
-
 GPT_OSS_MODEL_NAME = "openai/gpt-oss-20b"
 
 
diff --git a/tests/entrypoints/openai/tool_parsers/utils.py b/tests/entrypoints/openai/tool_parsers/utils.py
index cfa4d3584e709..7489a406224a5 100644
--- a/tests/entrypoints/openai/tool_parsers/utils.py
+++ b/tests/entrypoints/openai/tool_parsers/utils.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
-from typing import Union
 
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionRequest,
@@ -84,10 +83,10 @@ class StreamingToolReconstructor:
 def run_tool_extraction(
     tool_parser: ToolParser,
     model_output: str,
-    request: Union[ChatCompletionRequest, None] = None,
+    request: ChatCompletionRequest | None = None,
     streaming: bool = False,
     assert_one_tool_per_delta: bool = True,
-) -> tuple[Union[str, None], list[ToolCall]]:
+) -> tuple[str | None, list[ToolCall]]:
     if streaming:
         reconstructor = run_tool_extraction_streaming(
             tool_parser,
@@ -105,7 +104,7 @@ def run_tool_extraction(
 def run_tool_extraction_nonstreaming(
     tool_parser: ToolParser,
     model_output: str,
-    request: Union[ChatCompletionRequest, None] = None,
+    request: ChatCompletionRequest | None = None,
 ) -> ExtractedToolCallInformation:
     request = request or ChatCompletionRequest(messages=[], model="test-model")
     return tool_parser.extract_tool_calls(model_output, request)
@@ -114,7 +113,7 @@ def run_tool_extraction_nonstreaming(
 def run_tool_extraction_streaming(
     tool_parser: ToolParser,
     model_deltas: Iterable[str],
-    request: Union[ChatCompletionRequest, None] = None,
+    request: ChatCompletionRequest | None = None,
     assert_one_tool_per_delta: bool = True,
 ) -> StreamingToolReconstructor:
     request = request or ChatCompletionRequest(messages=[], model="test-model")
diff --git a/tests/entrypoints/pooling/openai/test_embedding_dimensions.py b/tests/entrypoints/pooling/openai/test_embedding_dimensions.py
index 92df43d7dbdcf..ba9fb64262772 100644
--- a/tests/entrypoints/pooling/openai/test_embedding_dimensions.py
+++ b/tests/entrypoints/pooling/openai/test_embedding_dimensions.py
@@ -4,8 +4,6 @@
 Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`.
 """
 
-from typing import Optional
-
 import openai
 import pytest
 
@@ -103,14 +101,14 @@ async def test_matryoshka(
         run_embedding_correctness_test(hf_model, prompts, vllm_outputs, dimensions)
 
     if model_info.is_matryoshka:
-        valid_dimensions: list[Optional[int]] = [None]
+        valid_dimensions: list[int | None] = [None]
         if model_info.matryoshka_dimensions is not None:
             valid_dimensions += model_info.matryoshka_dimensions[:2]
 
         for dimensions in valid_dimensions:
             await make_request_and_correctness_test(dimensions)
 
-        invalid_dimensions: list[Optional[int]] = [-1]
+        invalid_dimensions: list[int | None] = [-1]
         if model_info.matryoshka_dimensions is not None:
             assert 5 not in model_info.matryoshka_dimensions
             invalid_dimensions.append(5)
diff --git a/tests/entrypoints/test_api_server_process_manager.py b/tests/entrypoints/test_api_server_process_manager.py
index e548f52e1e94d..3fadbf2ef0dd0 100644
--- a/tests/entrypoints/test_api_server_process_manager.py
+++ b/tests/entrypoints/test_api_server_process_manager.py
@@ -5,7 +5,6 @@ import multiprocessing
 import socket
 import threading
 import time
-from typing import Optional
 from unittest.mock import patch
 
 import pytest
@@ -105,7 +104,7 @@ def test_wait_for_completion_or_failure(api_server_args):
         assert len(manager.processes) == 3
 
         # Create a result capture for the thread
-        result: dict[str, Optional[Exception]] = {"exception": None}
+        result: dict[str, Exception | None] = {"exception": None}
 
         def run_with_exception_capture():
             try:
@@ -218,7 +217,7 @@ def test_external_process_monitoring(api_server_args):
         assert len(manager.processes) == 3
 
         # Create a result capture for the thread
-        result: dict[str, Optional[Exception]] = {"exception": None}
+        result: dict[str, Exception | None] = {"exception": None}
 
         def run_with_exception_capture():
             try:
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index dcd196ebdd772..224b68412e60a 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -3,7 +3,7 @@
 
 import warnings
 from collections.abc import Mapping
-from typing import Literal, Optional
+from typing import Literal
 
 import pytest
 from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
@@ -152,9 +152,9 @@ def audio_url():
 
 
 def _assert_mm_data_is_image_input(
-    mm_data: Optional[MultiModalDataDict],
+    mm_data: MultiModalDataDict | None,
     image_count: int,
-    skipped_image_indices: Optional[list] = None,
+    skipped_image_indices: list | None = None,
 ) -> None:
     assert mm_data is not None
     assert set(mm_data.keys()) == {"image"}
@@ -169,9 +169,9 @@ def _assert_mm_data_is_image_input(
 
 
 def _assert_mm_uuids(
-    mm_uuids: Optional[MultiModalUUIDDict],
+    mm_uuids: MultiModalUUIDDict | None,
     media_count: int,
-    expected_uuids: list[Optional[str]],
+    expected_uuids: list[str | None],
     modality: str = "image",
 ) -> None:
     if len(expected_uuids) > 0:
@@ -193,9 +193,9 @@ MultiModalDataCounts = Mapping[ModalityType, int]
 
 
 def _assert_mm_data_inputs(
-    mm_data: Optional[MultiModalDataDict],
+    mm_data: MultiModalDataDict | None,
     data_count: MultiModalDataCounts,
-    skipped_media_indices: Optional[dict[str, list]] = None,  # modality -> list[int]
+    skipped_media_indices: dict[str, list] | None = None,  # modality -> list[int]
 ) -> None:
     assert mm_data is not None
     assert set(data_count.keys()) == (set(mm_data.keys()))
diff --git a/tests/entrypoints/test_renderer.py b/tests/entrypoints/test_renderer.py
index f93978c3e6e72..c811a6ba63cb5 100644
--- a/tests/entrypoints/test_renderer.py
+++ b/tests/entrypoints/test_renderer.py
@@ -3,7 +3,6 @@
 
 import io
 from dataclasses import dataclass
-from typing import Optional
 from unittest.mock import AsyncMock, MagicMock
 
 import pybase64
@@ -17,7 +16,7 @@ from vllm.inputs.data import is_embeds_prompt
 @dataclass
 class MockModelConfig:
     max_model_len: int = 100
-    encoder_config: Optional[dict] = None
+    encoder_config: dict | None = None
 
 
 class MockTokenizerResult:
diff --git a/tests/evals/gsm8k/gsm8k_eval.py b/tests/evals/gsm8k/gsm8k_eval.py
index 9edec7a78ca23..c7799607912b6 100644
--- a/tests/evals/gsm8k/gsm8k_eval.py
+++ b/tests/evals/gsm8k/gsm8k_eval.py
@@ -12,7 +12,6 @@ import json
 import os
 import time
 from collections.abc import Generator
-from typing import Optional, Union
 
 import aiohttp
 import numpy as np
@@ -23,7 +22,7 @@ from tqdm.asyncio import tqdm
 INVALID = -9999999
 
 
-def download_and_cache_file(url: str, filename: Optional[str] = None) -> str:
+def download_and_cache_file(url: str, filename: str | None = None) -> str:
     """Download and cache a file from a URL."""
     if filename is None:
         filename = os.path.join("/tmp", url.split("/")[-1])
@@ -81,9 +80,9 @@ async def call_vllm_api(
     prompt: str,
     temperature: float,
     max_tokens: int,
-    stop: Optional[list[str]] = None,
-    url: Optional[str] = None,
-    seed: Optional[int] = None,
+    stop: list[str] | None = None,
+    url: str | None = None,
+    seed: int | None = None,
 ) -> str:
     """Call vLLM's OpenAI-compatible completions endpoint."""
     data = {
@@ -112,8 +111,8 @@ def evaluate_gsm8k(
     host: str = "http://127.0.0.1",
     port: int = 8000,
     temperature: float = 0.0,
-    seed: Optional[int] = 42,
-) -> dict[str, Union[float, int]]:
+    seed: int | None = 42,
+) -> dict[str, float | int]:
     """
     Evaluate GSM8K accuracy using vLLM serve endpoint.
 
diff --git a/tests/kernels/attention/test_aiter_flash_attn.py b/tests/kernels/attention/test_aiter_flash_attn.py
index 88b21a9b84d64..1dec46e33f22e 100644
--- a/tests/kernels/attention/test_aiter_flash_attn.py
+++ b/tests/kernels/attention/test_aiter_flash_attn.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import pytest
 import torch
@@ -27,8 +26,8 @@ def ref_paged_attn(
     kv_lens: list[int],
     block_tables: torch.Tensor,
     scale: float,
-    sliding_window: Optional[int] = None,
-    soft_cap: Optional[float] = None,
+    sliding_window: int | None = None,
+    soft_cap: float | None = None,
 ) -> torch.Tensor:
     num_seqs = len(query_lens)
     block_tables = block_tables.cpu().numpy()
@@ -94,12 +93,12 @@ def test_varlen_with_paged_kv(
     seq_lens: list[tuple[int, int]],
     num_heads: tuple[int, int],
     head_size: int,
-    sliding_window: Optional[int],
+    sliding_window: int | None,
     dtype: torch.dtype,
     block_size: int,
-    soft_cap: Optional[float],
+    soft_cap: float | None,
     num_blocks: int,
-    q_dtype: Optional[torch.dtype],
+    q_dtype: torch.dtype | None,
 ) -> None:
     torch.set_default_device("cuda")
     current_platform.seed_everything(0)
diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py
index 16e544eb3cf9f..15cdb950a7db5 100644
--- a/tests/kernels/attention/test_attention.py
+++ b/tests/kernels/attention/test_attention.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import random
-from typing import Optional
 
 import pytest
 import torch
@@ -50,7 +49,7 @@ def ref_masked_attention(
     key: torch.Tensor,
     value: torch.Tensor,
     scale: float,
-    attn_mask: Optional[torch.Tensor] = None,
+    attn_mask: torch.Tensor | None = None,
 ) -> torch.Tensor:
     attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float()
     if attn_mask is not None:
@@ -69,7 +68,7 @@ def ref_single_query_cached_kv_attention(
     block_tables: torch.Tensor,
     seq_lens: torch.Tensor,
     scale: float,
-    alibi_slopes: Optional[torch.Tensor],
+    alibi_slopes: torch.Tensor | None,
 ) -> None:
     num_query_heads = query.shape[1]
     num_kv_heads = value_cache.shape[1]
@@ -415,7 +414,7 @@ def ref_multi_query_kv_attention(
     key: torch.Tensor,
     value: torch.Tensor,
     scale: float,
-    alibi_bias: Optional[list[torch.Tensor]],
+    alibi_bias: list[torch.Tensor] | None,
     dtype: torch.dtype,
 ) -> torch.Tensor:
     num_seqs = len(cu_seq_lens) - 1
diff --git a/tests/kernels/attention/test_cascade_flash_attn.py b/tests/kernels/attention/test_cascade_flash_attn.py
index 58e8bd592ba43..4295f852f95bb 100755
--- a/tests/kernels/attention/test_cascade_flash_attn.py
+++ b/tests/kernels/attention/test_cascade_flash_attn.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import pytest
 import torch
@@ -85,7 +84,7 @@ def test_cascade(
     head_size: int,
     dtype: torch.dtype,
     block_size: int,
-    soft_cap: Optional[float],
+    soft_cap: float | None,
     num_blocks: int,
     fa_version: int,
 ) -> None:
diff --git a/tests/kernels/attention/test_cutlass_mla_decode.py b/tests/kernels/attention/test_cutlass_mla_decode.py
index dad1510ce532b..a60f4e385a893 100644
--- a/tests/kernels/attention/test_cutlass_mla_decode.py
+++ b/tests/kernels/attention/test_cutlass_mla_decode.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 import random
-from typing import Optional
 
 import pytest
 import torch
@@ -17,7 +16,7 @@ def cal_diff(
     y: torch.Tensor,
     name: str,
     use_fp8: bool = False,
-    diff_threshold: Optional[float] = None,
+    diff_threshold: float | None = None,
 ) -> None:
     x, y = x.double(), y.double()
     cos_diff = 1 - 2 * (x * y).sum().item() / max((x * x + y * y).sum().item(), 1e-12)
diff --git a/tests/kernels/attention/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py
index d39f0a593ed41..18995545552ea 100644
--- a/tests/kernels/attention/test_flash_attn.py
+++ b/tests/kernels/attention/test_flash_attn.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import pytest
 import torch
@@ -34,8 +33,8 @@ def ref_paged_attn(
     kv_lens: list[int],
     block_tables: torch.Tensor,
     scale: float,
-    sliding_window: Optional[int] = None,
-    soft_cap: Optional[float] = None,
+    sliding_window: int | None = None,
+    soft_cap: float | None = None,
 ) -> torch.Tensor:
     num_seqs = len(query_lens)
     block_tables = block_tables.cpu().numpy()
@@ -103,11 +102,11 @@ def test_flash_attn_with_paged_kv(
     head_size: int,
     dtype: torch.dtype,
     block_size: int,
-    soft_cap: Optional[float],
+    soft_cap: float | None,
     num_blocks: int,
-    sliding_window: Optional[int],
+    sliding_window: int | None,
     fa_version: int,
-    q_dtype: Optional[torch.dtype],
+    q_dtype: torch.dtype | None,
 ) -> None:
     torch.set_default_device("cuda")
     if not is_fa_version_supported(fa_version):
@@ -221,13 +220,13 @@ def test_varlen_with_paged_kv(
     seq_lens: list[tuple[int, int]],
     num_heads: tuple[int, int],
     head_size: int,
-    sliding_window: Optional[int],
+    sliding_window: int | None,
     dtype: torch.dtype,
     block_size: int,
-    soft_cap: Optional[float],
+    soft_cap: float | None,
     num_blocks: int,
     fa_version: int,
-    q_dtype: Optional[torch.dtype],
+    q_dtype: torch.dtype | None,
 ) -> None:
     torch.set_default_device("cuda")
     if not is_fa_version_supported(fa_version):
diff --git a/tests/kernels/attention/test_flashinfer.py b/tests/kernels/attention/test_flashinfer.py
index 52cd10fdc5be0..82ec2ef14e56c 100644
--- a/tests/kernels/attention/test_flashinfer.py
+++ b/tests/kernels/attention/test_flashinfer.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import flashinfer
 import pytest
@@ -26,8 +25,8 @@ def ref_paged_attn(
     kv_lens: list[int],
     block_tables: torch.Tensor,
     scale: float,
-    sliding_window: Optional[int] = None,
-    soft_cap: Optional[float] = None,
+    sliding_window: int | None = None,
+    soft_cap: float | None = None,
 ) -> torch.Tensor:
     num_seqs = len(query_lens)
     block_tables = block_tables.cpu().numpy()
@@ -90,8 +89,8 @@ def test_flashinfer_decode_with_paged_kv(
     head_size: int,
     dtype: torch.dtype,
     block_size: int,
-    soft_cap: Optional[float],
-    sliding_window: Optional[int],
+    soft_cap: float | None,
+    sliding_window: int | None,
 ) -> None:
     torch.set_default_device("cuda")
     current_platform.seed_everything(0)
@@ -185,8 +184,8 @@ def test_flashinfer_prefill_with_paged_kv(
     head_size: int,
     dtype: torch.dtype,
     block_size: int,
-    soft_cap: Optional[float],
-    sliding_window: Optional[int],
+    soft_cap: float | None,
+    sliding_window: int | None,
 ) -> None:
     torch.set_default_device("cuda")
     current_platform.seed_everything(0)
@@ -288,7 +287,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
     head_size: int,
     dtype: torch.dtype,
     block_size: int,
-    soft_cap: Optional[float],
+    soft_cap: float | None,
 ) -> None:
     pytest.skip("TODO: fix the accuracy issue")
     torch.set_default_device("cuda")
@@ -398,7 +397,7 @@ def test_flashinfer_decode_with_paged_fp8_kv(
     head_size: int,
     dtype: torch.dtype,
     block_size: int,
-    soft_cap: Optional[float],
+    soft_cap: float | None,
 ) -> None:
     # test doesn't work for num_heads = (16,16)
     torch.set_default_device("cuda")
diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py
index 61157429ec9cc..00f06da5a47b4 100644
--- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py
+++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import flashinfer
 import pytest
@@ -68,9 +67,7 @@ NUM_BLOCKS = 32768  # Large enough to test overflow in index calculation.
 @torch.inference_mode
 def test_flashinfer_trtllm_decode_with_baseline(
     dtype: torch.dtype,
-    quant_dtypes: tuple[
-        Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
-    ],
+    quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
     batch_size: int,
     max_seq_lens: tuple[int, int],
     num_heads: tuple[int, int],
@@ -78,7 +75,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
     kv_layout: str,
     block_size: int,
     window_left: int,
-    soft_cap: Optional[float],
+    soft_cap: float | None,
     has_sinks: bool,
 ) -> None:
     torch.set_default_device("cuda")
@@ -267,9 +264,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
 @torch.inference_mode
 def test_flashinfer_trtllm_prefill_with_baseline(
     dtype: torch.dtype,
-    quant_dtypes: tuple[
-        Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
-    ],
+    quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
     batch_size: int,
     max_seq_lens: tuple[int, int],
     num_heads: tuple[int, int],
@@ -277,7 +272,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
     kv_layout: str,
     block_size: int,
     window_left: int,
-    soft_cap: Optional[float],
+    soft_cap: float | None,
     has_sinks: bool,
 ) -> None:
     torch.set_default_device("cuda")
diff --git a/tests/kernels/attention/test_merge_attn_states.py b/tests/kernels/attention/test_merge_attn_states.py
index eb9204dfaf158..9b084f2f660b2 100644
--- a/tests/kernels/attention/test_merge_attn_states.py
+++ b/tests/kernels/attention/test_merge_attn_states.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import pytest
 import torch
@@ -20,7 +19,7 @@ def merge_attn_states_torch(
     prefix_lse: torch.Tensor,  # [NUM_HEADS, NUM_TOKENS]
     suffix_output: torch.Tensor,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
     suffix_lse: torch.Tensor,  # [NUM_HEADS, NUM_TOKENS]
-    output_lse: Optional[torch.Tensor] = None,  # [NUM_HEADS, NUM_TOKENS]
+    output_lse: torch.Tensor | None = None,  # [NUM_HEADS, NUM_TOKENS]
 ):
     p_lse = prefix_lse
     s_lse = suffix_lse
diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py
index fba82cfdadbdf..bf4d2179af5f9 100644
--- a/tests/kernels/attention/test_triton_unified_attention.py
+++ b/tests/kernels/attention/test_triton_unified_attention.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import pytest
 import torch
@@ -32,8 +31,8 @@ def ref_paged_attn(
     kv_lens: list[int],
     block_tables: torch.Tensor,
     scale: float,
-    sliding_window: Optional[int] = None,
-    soft_cap: Optional[float] = None,
+    sliding_window: int | None = None,
+    soft_cap: float | None = None,
 ) -> torch.Tensor:
     num_seqs = len(query_lens)
     block_tables = block_tables.cpu().numpy()
@@ -98,12 +97,12 @@ def test_triton_unified_attn(
     seq_lens: list[tuple[int, int]],
     num_heads: tuple[int, int],
     head_size: int,
-    sliding_window: Optional[int],
+    sliding_window: int | None,
     dtype: torch.dtype,
     block_size: int,
-    soft_cap: Optional[float],
+    soft_cap: float | None,
     num_blocks: int,
-    q_dtype: Optional[torch.dtype],
+    q_dtype: torch.dtype | None,
 ) -> None:
     torch.set_default_device("cuda")
 
diff --git a/tests/kernels/core/test_fused_quant_layernorm.py b/tests/kernels/core/test_fused_quant_layernorm.py
index 52133ec53d1d7..418c700bbf003 100644
--- a/tests/kernels/core/test_fused_quant_layernorm.py
+++ b/tests/kernels/core/test_fused_quant_layernorm.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
 
 import pytest
 import torch
@@ -31,13 +30,13 @@ EPS = 1e-6
 ## Helpers
 
 
-def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
+def as_float32_tensor(x: float | torch.Tensor) -> torch.Tensor:
     return torch.as_tensor(x, dtype=torch.float32, device="cuda")
 
 
 def ref_rms_norm(
-    rms_norm_layer: RMSNorm, x: torch.Tensor, residual: Optional[torch.Tensor]
-) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor | None
+) -> tuple[torch.Tensor, torch.Tensor | None]:
     if residual is not None:
         residual = residual.clone()
         out, residual = rms_norm_layer.forward_native(x, residual)
@@ -51,9 +50,9 @@ def ref_dynamic_per_token_quant(
     rms_norm_layer: RMSNorm,
     x: torch.Tensor,
     quant_dtype: torch.dtype,
-    residual: Optional[torch.Tensor],
-    scale_ub: Optional[torch.Tensor],
-) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    residual: torch.Tensor | None,
+    scale_ub: torch.Tensor | None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
     if scale_ub is not None:
         assert quant_dtype == torch.float8_e4m3fn
 
@@ -76,9 +75,9 @@ def ref_impl(
     rms_norm_layer: RMSNorm,
     x: torch.Tensor,
     quant_dtype: torch.dtype,
-    residual: Optional[torch.Tensor],
-    scale_ub: Optional[torch.Tensor],
-) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    residual: torch.Tensor | None,
+    scale_ub: torch.Tensor | None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
     return ref_dynamic_per_token_quant(
         rms_norm_layer, x, quant_dtype, residual, scale_ub
     )
@@ -88,9 +87,9 @@ def ops_dynamic_per_token_quant(
     weight: torch.Tensor,
     x: torch.Tensor,
     quant_dtype: torch.dtype,
-    residual: Optional[torch.Tensor],
-    scale_ub: Optional[torch.Tensor],
-) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    residual: torch.Tensor | None,
+    scale_ub: torch.Tensor | None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
     if residual is not None:
         residual = residual.clone()
     out, scales = ops.rms_norm_dynamic_per_token_quant(
@@ -103,9 +102,9 @@ def ops_impl(
     weight: torch.Tensor,
     x: torch.Tensor,
     quant_dtype: torch.dtype,
-    residual: Optional[torch.Tensor],
-    scale_ub: Optional[torch.Tensor],
-) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    residual: torch.Tensor | None,
+    scale_ub: torch.Tensor | None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
     return ops_dynamic_per_token_quant(weight, x, quant_dtype, residual, scale_ub)
 
 
diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py
index 799e0a3f2a2bd..e1ddc5de067bb 100644
--- a/tests/kernels/core/test_pos_encoding.py
+++ b/tests/kernels/core/test_pos_encoding.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from collections.abc import Callable
 from itertools import product
-from typing import Callable, Optional
 
 import pytest
 import torch
@@ -68,7 +68,7 @@ def test_rotary_embedding(
     seq_len: int,
     num_heads: int,
     head_size: int,
-    rotary_dim: Optional[int],
+    rotary_dim: int | None,
     dtype: torch.dtype,
     seed: int,
     device: str,
diff --git a/tests/kernels/core/test_rotary_embedding.py b/tests/kernels/core/test_rotary_embedding.py
index 0a292a3e2ae70..30c64e0bd72a7 100644
--- a/tests/kernels/core/test_rotary_embedding.py
+++ b/tests/kernels/core/test_rotary_embedding.py
@@ -4,8 +4,6 @@
 Tests for miscellaneous utilities
 """
 
-from typing import Optional
-
 import pytest
 import torch
 
@@ -17,7 +15,7 @@ def rotary_embedding_opcheck(
     rot,
     positions: torch.Tensor,
     query: torch.Tensor,
-    key: Optional[torch.Tensor] = None,
+    key: torch.Tensor | None = None,
 ):
     cos_sin_cache = rot.cos_sin_cache.to(query.device, dtype=query.dtype)
 
diff --git a/tests/kernels/mamba/test_causal_conv1d.py b/tests/kernels/mamba/test_causal_conv1d.py
index fea6b94481b60..d9023490d7fc2 100644
--- a/tests/kernels/mamba/test_causal_conv1d.py
+++ b/tests/kernels/mamba/test_causal_conv1d.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import pytest
 import torch
@@ -19,11 +18,11 @@ from vllm.platforms import current_platform
 def causal_conv1d_ref(
     x: torch.Tensor,
     weight: torch.Tensor,
-    bias: Optional[torch.Tensor] = None,
-    initial_states: Optional[torch.Tensor] = None,
+    bias: torch.Tensor | None = None,
+    initial_states: torch.Tensor | None = None,
     return_final_states: bool = False,
-    final_states_out: Optional[torch.Tensor] = None,
-    activation: Optional[str] = "silu",
+    final_states_out: torch.Tensor | None = None,
+    activation: str | None = "silu",
 ):
     """
     x: (batch, dim, seqlen)
@@ -117,12 +116,12 @@ def causal_conv1d_update_ref(
 def causal_conv1d_opcheck_fn(
     x: torch.Tensor,
     weight: torch.Tensor,
-    bias: Optional[torch.Tensor] = None,
-    cu_seq_len: Optional[torch.Tensor] = None,
-    cache_indices: Optional[torch.Tensor] = None,
-    has_initial_state: Optional[torch.Tensor] = None,
-    conv_states: Optional[torch.Tensor] = None,
-    activation: Optional[str] = "silu",
+    bias: torch.Tensor | None = None,
+    cu_seq_len: torch.Tensor | None = None,
+    cache_indices: torch.Tensor | None = None,
+    has_initial_state: torch.Tensor | None = None,
+    conv_states: torch.Tensor | None = None,
+    activation: str | None = "silu",
     pad_slot_id: int = PAD_SLOT_ID,
 ):
     """
diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py
index ff12d1fb9a805..94a305a063c3a 100644
--- a/tests/kernels/moe/modular_kernel_tools/common.py
+++ b/tests/kernels/moe/modular_kernel_tools/common.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 
@@ -35,7 +35,7 @@ from .mk_objects import (
 from .parallel_utils import ProcessGroupInfo
 
 
-def _describe_tensor(t: Optional[torch.Tensor], name: str) -> str:
+def _describe_tensor(t: torch.Tensor | None, name: str) -> str:
     if t is None:
         return f"{name} : None"
     else:
@@ -44,21 +44,21 @@ def _describe_tensor(t: Optional[torch.Tensor], name: str) -> str:
 
 @dataclass
 class Config:
-    Ms: Union[list[int], int]
+    Ms: list[int] | int
     K: int
     N: int
     E: int
-    topks: Union[list[int], int]
+    topks: list[int] | int
     dtype: torch.dtype
-    quant_config: Optional[TestMoEQuantConfig]
+    quant_config: TestMoEQuantConfig | None
 
     prepare_finalize_type: mk.FusedMoEPrepareAndFinalize
     fused_experts_type: mk.FusedMoEPermuteExpertsUnpermute
 
-    fused_moe_chunk_size: Optional[int]
+    fused_moe_chunk_size: int | None
     world_size: int
 
-    torch_trace_dir_path: Optional[str] = None
+    torch_trace_dir_path: str | None = None
 
     def __post_init__(self):
         if self.quant_config is None:
@@ -93,7 +93,7 @@ class Config:
         return self.Ms
 
     @property
-    def quant_dtype(self) -> Union[torch.dtype, str, None]:
+    def quant_dtype(self) -> torch.dtype | str | None:
         assert self.quant_config is not None
         return self.quant_config.quant_dtype
 
@@ -112,7 +112,7 @@ class Config:
         return self.quant_config.per_out_ch_quant
 
     @property
-    def quant_block_shape(self) -> Optional[list[int]]:
+    def quant_block_shape(self) -> list[int] | None:
         assert self.quant_config is not None
         return self.quant_config.block_shape
 
@@ -209,7 +209,7 @@ class Config:
         info = prepare_finalize_info(self.prepare_finalize_type)
         return info.backend
 
-    def is_valid(self) -> tuple[bool, Optional[str]]:
+    def is_valid(self) -> tuple[bool, str | None]:
         # Check prepare-finalize and fused-experts compatibility
         if self.is_batched_prepare_finalize():
             if not self.is_batched_fused_experts():
@@ -280,10 +280,10 @@ class Config:
 class WeightTensors:
     w1: torch.Tensor
     w2: torch.Tensor
-    w1_scale: Optional[torch.Tensor]
-    w2_scale: Optional[torch.Tensor]
-    w1_gs: Optional[torch.Tensor] = None
-    w2_gs: Optional[torch.Tensor] = None
+    w1_scale: torch.Tensor | None
+    w2_scale: torch.Tensor | None
+    w1_gs: torch.Tensor | None = None
+    w2_gs: torch.Tensor | None = None
 
     def describe(self):
         s = ""
@@ -351,11 +351,11 @@ class WeightTensors:
 @dataclass
 class RankTensors:
     hidden_states: torch.Tensor
-    hidden_states_scale: Optional[torch.Tensor]
+    hidden_states_scale: torch.Tensor | None
 
     topk_weights: torch.Tensor
     topk_ids: torch.Tensor
-    expert_map: Optional[torch.Tensor]
+    expert_map: torch.Tensor | None
 
     def describe(self):
         s = ""
@@ -370,7 +370,7 @@ class RankTensors:
     @staticmethod
     def make_hidden_states(
         config: Config,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         """
         Return hidden_states
         """
diff --git a/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py b/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py
index 7d555202afe6a..95db6327c4f10 100644
--- a/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py
+++ b/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py
@@ -4,7 +4,6 @@
 import copy
 from enum import Enum
 from itertools import product
-from typing import Optional
 
 import torch
 from tqdm import tqdm
@@ -82,7 +81,7 @@ def make_feature_matrix(csv_file_path: str):
     import pandas as pd
 
     def add_to_results(
-        config: Config, success: Result, results_df: Optional[pd.DataFrame] = None
+        config: Config, success: Result, results_df: pd.DataFrame | None = None
     ):
         config_dict = asdict(config)
         config_dict["prepare_finalize_type"] = config_dict[
@@ -121,7 +120,7 @@ def make_feature_matrix(csv_file_path: str):
         product(Ms, Ks, Ns, Es, TOPKs, DTYPEs, PF_TYPES, FE_TYPES, Q_TYPES)
     )
 
-    results_df: Optional[pd.DataFrame] = None
+    results_df: pd.DataFrame | None = None
     for m, k, n, e, topks, dtype, pf_type, experts_type, quant_config in tqdm(
         combinations
     ):
diff --git a/tests/kernels/moe/modular_kernel_tools/mk_objects.py b/tests/kernels/moe/modular_kernel_tools/mk_objects.py
index 174b2d1781ae0..aa41f89cae7dc 100644
--- a/tests/kernels/moe/modular_kernel_tools/mk_objects.py
+++ b/tests/kernels/moe/modular_kernel_tools/mk_objects.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import Optional, Union
 
 import torch
 
@@ -43,25 +42,25 @@ from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
 
 @dataclass
 class TestMoEQuantConfig:
-    quant_dtype: Union[torch.dtype, str, None]
+    quant_dtype: torch.dtype | str | None
     per_out_ch_quant: bool
     per_act_token_quant: bool
-    block_shape: Optional[list[int]]
+    block_shape: list[int] | None
 
 
 @dataclass
 class PrepareFinalizeInfo:
     activation_format: mk.FusedMoEActivationFormat
-    supported_dtypes: list[Union[torch.dtype, str]]
+    supported_dtypes: list[torch.dtype | str]
     blocked_quantization_support: bool
-    backend: Optional[str]
+    backend: str | None
     supports_apply_weight_on_input: bool = True
 
 
 @dataclass
 class ExpertInfo:
     activation_format: mk.FusedMoEActivationFormat
-    supported_dtypes: list[Union[torch.dtype, str]]
+    supported_dtypes: list[torch.dtype | str]
     blocked_quantization_support: bool
     supports_chunking: bool
     supports_expert_map: bool
@@ -78,7 +77,7 @@ MK_FUSED_EXPERT_TYPES: list[mk.FusedMoEPermuteExpertsUnpermute] = []
 
 standard_format = mk.FusedMoEActivationFormat.Standard
 batched_format = mk.FusedMoEActivationFormat.BatchedExperts
-common_float_types: list[Union[torch.dtype, str]] = [
+common_float_types: list[torch.dtype | str] = [
     torch.float8_e4m3fn,
     torch.bfloat16,
     torch.float16,
@@ -92,9 +91,9 @@ fp8_types = [torch.float8_e4m3fn]
 def register_prepare_and_finalize(
     kind,
     activation_format: mk.FusedMoEActivationFormat,
-    supported_dtypes: list[Union[torch.dtype, str]],
+    supported_dtypes: list[torch.dtype | str],
     blocked_quantization_support: bool,
-    backend: Optional[str],
+    backend: str | None,
     force_multigpu: bool = False,
     supports_apply_weight_on_input: bool = True,
 ):
@@ -121,7 +120,7 @@ def register_prepare_and_finalize(
 def register_experts(
     kind,
     activation_format: mk.FusedMoEActivationFormat,
-    supported_dtypes: list[Union[torch.dtype, str]],
+    supported_dtypes: list[torch.dtype | str],
     blocked_quantization_support: bool,
     supports_chunking: bool,
     supports_expert_map: bool,
@@ -340,7 +339,7 @@ if cutlass_fp4_supported():
         supports_expert_map=False,
     )
 
-MK_QUANT_CONFIGS: list[Optional[TestMoEQuantConfig]] = [
+MK_QUANT_CONFIGS: list[TestMoEQuantConfig | None] = [
     None,
     # per-channel / per-column weights and per-tensor activations
     TestMoEQuantConfig(
@@ -395,7 +394,7 @@ if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe():
 
 def make_prepare_finalize(
     prepare_finalize_type: mk.FusedMoEPrepareAndFinalize,
-    backend: Optional[str],
+    backend: str | None,
     moe: FusedMoEConfig,
     quant_config: FusedMoEQuantConfig,
 ) -> mk.FusedMoEPrepareAndFinalize:
diff --git a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py
index 7802129d3d48f..4aad820635ad7 100644
--- a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py
+++ b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py
@@ -3,11 +3,12 @@
 import dataclasses
 import os
 import traceback
-from typing import Any, Callable, Optional
+from collections.abc import Callable
+from typing import Any, Concatenate
 
 import torch
 from torch.multiprocessing import spawn  # pyright: ignore[reportPrivateImportUsage]
-from typing_extensions import Concatenate, ParamSpec
+from typing_extensions import ParamSpec
 
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.distributed import init_distributed_environment, initialize_model_parallel
@@ -58,9 +59,9 @@ def _worker_parallel_launch(
     world_local_size: int,
     node_rank: int,
     init_method: str,
-    worker: Callable[Concatenate[ProcessGroupInfo, Optional[VllmConfig], Any, P], None],
-    vllm_config: Optional[VllmConfig],
-    env_dict: Optional[dict],
+    worker: Callable[Concatenate[ProcessGroupInfo, VllmConfig | None, Any, P], None],
+    vllm_config: VllmConfig | None,
+    env_dict: dict | None,
     *args: P.args,
     **kwargs: P.kwargs,
 ) -> None:
diff --git a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py
index 48e5c4659b49a..a3e264c5f5e28 100644
--- a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py
+++ b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py
@@ -2,8 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import copy
+from collections.abc import Callable
 from itertools import product
-from typing import Any, Callable
+from typing import Any
 
 import torch
 
diff --git a/tests/kernels/moe/parallel_utils.py b/tests/kernels/moe/parallel_utils.py
index fb9e5df281f1d..d83b63e187c2f 100644
--- a/tests/kernels/moe/parallel_utils.py
+++ b/tests/kernels/moe/parallel_utils.py
@@ -7,12 +7,13 @@ DeepEP test utilities
 import dataclasses
 import os
 import traceback
-from typing import Callable, Optional
+from collections.abc import Callable
+from typing import Concatenate
 
 import torch
 from torch.distributed import ProcessGroup
 from torch.multiprocessing import spawn  # pyright: ignore[reportPrivateImportUsage]
-from typing_extensions import Concatenate, ParamSpec
+from typing_extensions import ParamSpec
 
 from vllm.utils import get_open_port, has_deep_ep
 
@@ -126,8 +127,8 @@ def make_deepep_ht_a2a(
     pgi: ProcessGroupInfo,
     dp_size: int,
     ht_args: DeepEPHTArgs,
-    q_dtype: Optional[torch.dtype] = None,
-    block_shape: Optional[list[int]] = None,
+    q_dtype: torch.dtype | None = None,
+    block_shape: list[int] | None = None,
 ):
     import deep_ep
 
@@ -153,8 +154,8 @@ def make_deepep_ll_a2a(
     pg: ProcessGroup,
     pgi: ProcessGroupInfo,
     deepep_ll_args: DeepEPLLArgs,
-    q_dtype: Optional[torch.dtype] = None,
-    block_shape: Optional[list[int]] = None,
+    q_dtype: torch.dtype | None = None,
+    block_shape: list[int] | None = None,
 ):
     import deep_ep
 
@@ -185,10 +186,10 @@ def make_deepep_a2a(
     pg: ProcessGroup,
     pgi: ProcessGroupInfo,
     dp_size: int,
-    deepep_ht_args: Optional[DeepEPHTArgs],
-    deepep_ll_args: Optional[DeepEPLLArgs],
-    q_dtype: Optional[torch.dtype] = None,
-    block_shape: Optional[list[int]] = None,
+    deepep_ht_args: DeepEPHTArgs | None,
+    deepep_ll_args: DeepEPLLArgs | None,
+    q_dtype: torch.dtype | None = None,
+    block_shape: list[int] | None = None,
 ):
     if deepep_ht_args is not None:
         assert deepep_ll_args is None
diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py
index 09cede3fbcc77..2dce099770f08 100644
--- a/tests/kernels/moe/test_batched_moe.py
+++ b/tests/kernels/moe/test_batched_moe.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import Optional
 
 import pytest
 import torch
@@ -55,7 +54,7 @@ vllm_config.scheduler_config.max_model_len = 8192
 @dataclass
 class BatchedMMConfig:
     in_dtype: torch.dtype
-    quant_dtype: Optional[torch.dtype]
+    quant_dtype: torch.dtype | None
     out_dtype: torch.dtype
     num_experts: int
     max_tokens_per_expert: int
@@ -115,7 +114,7 @@ def test_batched_mm(
     K: int,
     N: int,
     dtype: torch.dtype,
-    block_shape: Optional[list[int]],
+    block_shape: list[int] | None,
     per_act_token_quant: bool,
 ):
     current_platform.seed_everything(7)
@@ -242,7 +241,7 @@ def test_fused_moe_batched_experts(
     topk: int,
     dtype: torch.dtype,
     per_act_token_quant: bool,
-    block_shape: Optional[list[int]],
+    block_shape: list[int] | None,
     input_scales: bool,
 ):
     current_platform.seed_everything(7)
diff --git a/tests/kernels/moe/test_count_expert_num_tokens.py b/tests/kernels/moe/test_count_expert_num_tokens.py
index 996a4538d1054..39138be83bccb 100644
--- a/tests/kernels/moe/test_count_expert_num_tokens.py
+++ b/tests/kernels/moe/test_count_expert_num_tokens.py
@@ -5,7 +5,6 @@ Tests compute_expert_num_tokens kernels
 """
 
 import dataclasses
-from typing import Optional
 
 import pytest
 import torch
@@ -16,7 +15,7 @@ from vllm.model_executor.layers.fused_moe.utils import count_expert_num_tokens
 @dataclasses.dataclass
 class TestTensors:
     topk_ids: torch.Tensor
-    expert_map: Optional[torch.Tensor] = None
+    expert_map: torch.Tensor | None = None
 
     def to_device(self, device: str):
         self.topk_ids = self.topk_ids.to(device=device)
diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py
index b82cea61bd4ea..4330eda251f75 100644
--- a/tests/kernels/moe/test_cutlass_moe.py
+++ b/tests/kernels/moe/test_cutlass_moe.py
@@ -3,7 +3,6 @@
 import copy
 import dataclasses
 from math import prod
-from typing import Optional
 
 import pytest
 import torch
@@ -85,16 +84,16 @@ class MOETensors:
 @dataclasses.dataclass
 class MOETensors8Bit(MOETensors):
     # quantized
-    a_q: Optional[torch.Tensor] = None  # a -> a_q
-    w1_q: Optional[torch.Tensor] = None  # w1 -> w1_q
-    w2_q: Optional[torch.Tensor] = None  # w2 -> w2_q
-    a_scale: Optional[torch.Tensor] = None
-    w1_scale: Optional[torch.Tensor] = None
-    w2_scale: Optional[torch.Tensor] = None
+    a_q: torch.Tensor | None = None  # a -> a_q
+    w1_q: torch.Tensor | None = None  # w1 -> w1_q
+    w2_q: torch.Tensor | None = None  # w2 -> w2_q
+    a_scale: torch.Tensor | None = None
+    w1_scale: torch.Tensor | None = None
+    w2_scale: torch.Tensor | None = None
     # dequantized
-    a_d: Optional[torch.Tensor] = None  # a -> a_q -> a_d
-    w1_d: Optional[torch.Tensor] = None  # w1 -> w1_q -> w1_d
-    w2_d: Optional[torch.Tensor] = None  # w2 -> w2_q -> w2_d
+    a_d: torch.Tensor | None = None  # a -> a_q -> a_d
+    w1_d: torch.Tensor | None = None  # w1 -> w1_q -> w1_d
+    w2_d: torch.Tensor | None = None  # w2 -> w2_q -> w2_d
 
     @staticmethod
     def make_moe_tensors_8bit(
@@ -209,7 +208,7 @@ def run_8_bit(
     topk_ids: torch.Tensor,
     per_act_token: bool,
     per_out_ch: bool,
-    num_local_experts: Optional[int] = None,
+    num_local_experts: int | None = None,
 ) -> torch.Tensor:
     assert not any(
         [
@@ -280,7 +279,7 @@ def test_cutlass_moe_8_bit_no_graph(
     per_act_token: bool,
     per_out_ch: bool,
     monkeypatch,
-    ep_size: Optional[int] = None,
+    ep_size: int | None = None,
 ):
     current_platform.seed_everything(7)
     monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py
index e68c5bfa5946f..65cd3e110a0fa 100644
--- a/tests/kernels/moe/test_deepep_deepgemm_moe.py
+++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py
@@ -7,7 +7,6 @@ fp8 block-quantized case.
 """
 
 import dataclasses
-from typing import Optional
 
 import pytest
 import torch.distributed
@@ -92,13 +91,13 @@ class TestConfig:
     block_size: list[int]
     # configs for testing low-latency kernels
     low_latency: bool
-    use_fp8_dispatch: Optional[bool] = False
+    use_fp8_dispatch: bool | None = False
 
 
 @dataclasses.dataclass
 class TestTensors:
     rank_tokens: torch.Tensor  # all ranks make this many tokens
-    rank_token_scales: Optional[torch.Tensor]
+    rank_token_scales: torch.Tensor | None
     topk: torch.Tensor
     topk_weights: torch.Tensor
     config: TestConfig
@@ -143,7 +142,7 @@ def make_ll_modular_kernel(
     max_tokens_per_rank: int,
     dp_size: int,
     hidden_size: int,
-    q_dtype: Optional[torch.dtype],
+    q_dtype: torch.dtype | None,
     test_config: TestConfig,
     quant_config: FusedMoEQuantConfig,
 ) -> FusedMoEModularKernel:
@@ -179,7 +178,7 @@ def make_ht_modular_kernel(
     pgi: ProcessGroupInfo,
     dp_size: int,
     num_local_experts: int,
-    q_dtype: Optional[torch.dtype],
+    q_dtype: torch.dtype | None,
     test_config: TestConfig,
     quant_config: FusedMoEQuantConfig,
 ) -> FusedMoEModularKernel:
@@ -249,8 +248,8 @@ def deepep_deepgemm_moe_impl(
     test_tensors: TestTensors,
     w1: torch.Tensor,
     w2: torch.Tensor,
-    w1_scale: Optional[torch.Tensor],
-    w2_scale: Optional[torch.Tensor],
+    w1_scale: torch.Tensor | None,
+    w2_scale: torch.Tensor | None,
 ) -> torch.Tensor:
     test_config = test_tensors.config
     num_experts = test_config.num_experts
diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py
index a1dabea1f0c7d..527c20fe6f80b 100644
--- a/tests/kernels/moe/test_deepep_moe.py
+++ b/tests/kernels/moe/test_deepep_moe.py
@@ -5,7 +5,6 @@ Test deepep dispatch-combine logic
 """
 
 import dataclasses
-from typing import Optional, Union
 
 import pytest
 import torch.distributed
@@ -90,7 +89,7 @@ class TestConfig:
 @dataclasses.dataclass
 class TestTensors:
     rank_tokens: torch.Tensor  # all ranks make this many tokens
-    rank_token_scales: Optional[torch.Tensor]
+    rank_token_scales: torch.Tensor | None
     topk: torch.Tensor
     topk_weights: torch.Tensor
     config: TestConfig
@@ -128,12 +127,12 @@ def make_modular_kernel(
     dp_size: int,
     num_experts: int,
     num_local_experts: int,
-    q_dtype: Optional[torch.dtype],
+    q_dtype: torch.dtype | None,
     use_fp8_dispatch: bool,
     quant_config: FusedMoEQuantConfig,
 ) -> FusedMoEModularKernel:
-    ht_args: Optional[DeepEPHTArgs] = None
-    ll_args: Optional[DeepEPLLArgs] = None
+    ht_args: DeepEPHTArgs | None = None
+    ll_args: DeepEPLLArgs | None = None
 
     if low_latency_mode:
         ll_args = DeepEPLLArgs(
@@ -148,16 +147,14 @@ def make_modular_kernel(
         )
         ht_args = DeepEPHTArgs(num_local_experts=num_local_experts)
 
-    a2a: Union[DeepEPHTPrepareAndFinalize, DeepEPLLPrepareAndFinalize] = (
-        make_deepep_a2a(
-            pg=pg,
-            pgi=pgi,
-            dp_size=dp_size,
-            q_dtype=q_dtype,
-            block_shape=None,
-            deepep_ht_args=ht_args,
-            deepep_ll_args=ll_args,
-        )
+    a2a: DeepEPHTPrepareAndFinalize | DeepEPLLPrepareAndFinalize = make_deepep_a2a(
+        pg=pg,
+        pgi=pgi,
+        dp_size=dp_size,
+        q_dtype=q_dtype,
+        block_shape=None,
+        deepep_ht_args=ht_args,
+        deepep_ll_args=ll_args,
     )
 
     num_dispatchers = pgi.world_size // dp_size
@@ -184,8 +181,8 @@ def deep_ep_moe_impl(
     test_tensors: TestTensors,
     w1: torch.Tensor,
     w2: torch.Tensor,
-    w1_scale: Optional[torch.Tensor],
-    w2_scale: Optional[torch.Tensor],
+    w1_scale: torch.Tensor | None,
+    w2_scale: torch.Tensor | None,
     num_experts: int,
     use_fp8_dispatch: bool,
     per_act_token_quant: bool,
@@ -281,8 +278,8 @@ def torch_moe_impl(
     test_tensors: TestTensors,
     w1: torch.Tensor,
     w2: torch.Tensor,
-    w1_scale: Optional[torch.Tensor],
-    w2_scale: Optional[torch.Tensor],
+    w1_scale: torch.Tensor | None,
+    w2_scale: torch.Tensor | None,
     using_fp8_dispatch: bool,
     per_act_token_quant: bool,
 ):
@@ -340,8 +337,8 @@ def _deep_ep_moe(
     config: TestConfig,
     w1: torch.Tensor,
     w2: torch.Tensor,
-    w1_scale: Optional[torch.Tensor],
-    w2_scale: Optional[torch.Tensor],
+    w1_scale: torch.Tensor | None,
+    w2_scale: torch.Tensor | None,
     use_fp8_dispatch: bool,
     per_act_token_quant: bool,
 ):
diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py
index b028e676f086f..a86185a2dc461 100644
--- a/tests/kernels/moe/test_modular_kernel_combinations.py
+++ b/tests/kernels/moe/test_modular_kernel_combinations.py
@@ -5,7 +5,7 @@ import copy
 import textwrap
 import traceback
 from itertools import product
-from typing import Any, Optional
+from typing import Any
 
 import pytest
 import torch
@@ -245,10 +245,10 @@ def test_modular_kernel_combinations_multigpu(
     n: int,
     e: int,
     dtype: torch.dtype,
-    quant_config: Optional[TestMoEQuantConfig],
+    quant_config: TestMoEQuantConfig | None,
     prepare_finalize_type: mk.FusedMoEPrepareAndFinalize,
     fused_experts_type: mk.FusedMoEPermuteExpertsUnpermute,
-    chunk_size: Optional[int],
+    chunk_size: int | None,
     world_size: int,
     pytestconfig,
 ):
@@ -287,10 +287,10 @@ def test_modular_kernel_combinations_singlegpu(
     n: int,
     e: int,
     dtype: torch.dtype,
-    quant_config: Optional[TestMoEQuantConfig],
+    quant_config: TestMoEQuantConfig | None,
     prepare_finalize_type: mk.FusedMoEPrepareAndFinalize,
     fused_experts_type: mk.FusedMoEPermuteExpertsUnpermute,
-    chunk_size: Optional[int],
+    chunk_size: int | None,
     world_size: int,
     pytestconfig,
 ):
diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py
index f357d149bd071..6b391c173f0bc 100644
--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -6,7 +6,7 @@ Run `pytest tests/kernels/test_moe.py`.
 """
 
 import functools
-from typing import Callable, Optional, Union
+from collections.abc import Callable
 
 import pytest
 import torch
@@ -80,7 +80,7 @@ vllm_config.scheduler_config.max_model_len = 8192
 
 
 def run_moe_test(
-    baseline: Union[Callable, torch.Tensor],
+    baseline: Callable | torch.Tensor,
     moe_fn: Callable,
     a: torch.Tensor,
     w1: torch.Tensor,
@@ -88,7 +88,7 @@ def run_moe_test(
     score: torch.Tensor,
     topk: int,
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
     padding: bool = False,
     use_compile: bool = False,
     use_cudagraph: bool = False,
@@ -212,7 +212,7 @@ def test_fused_moe(
         score: torch.Tensor,
         topk: int,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
+        expert_map: torch.Tensor | None = None,
     ) -> torch.Tensor:
         topk_weights, topk_ids, _ = fused_topk(a, score, topk, False)
         return m_fused_moe_fn(
diff --git a/tests/kernels/moe/test_moe_align_block_size.py b/tests/kernels/moe/test_moe_align_block_size.py
index f92526e749557..6f779c6950150 100644
--- a/tests/kernels/moe/test_moe_align_block_size.py
+++ b/tests/kernels/moe/test_moe_align_block_size.py
@@ -5,8 +5,6 @@
 Run `pytest tests/kernels/moe/test_moe_align_block_size.py`.
 """
 
-from typing import Optional
-
 import pytest
 import torch
 
@@ -94,7 +92,7 @@ def torch_moe_align_block_size(
     topk_ids: torch.Tensor,
     block_size: int,
     num_experts: int,
-    expert_map: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
     pad_sorted_ids: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """
diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py
index a6214437d404a..da9fe33a1c620 100644
--- a/tests/kernels/moe/test_moe_permute_unpermute.py
+++ b/tests/kernels/moe/test_moe_permute_unpermute.py
@@ -5,8 +5,6 @@
 Run `pytest tests/kernels/test_moe_permute_unpermute.py`.
 """
 
-from typing import Optional
-
 import numpy as np
 import pytest
 import torch
@@ -34,8 +32,8 @@ def torch_permute(
     n_expert: int,
     n_local_expert: int,
     start_expert: int,
-    expert_map: Optional[torch.Tensor] = None,
-    align_block_size: Optional[int] = None,
+    expert_map: torch.Tensor | None = None,
+    align_block_size: int | None = None,
     fill_invalid_expert: int = -1,
 ) -> list[torch.Tensor]:
     n_token, n_hidden = hidden_states.shape[0], hidden_states.shape[1]
@@ -210,7 +208,7 @@ def test_moe_permute_unpermute(
     n_expert: int,
     ep_size: int,
     dtype: torch.dtype,
-    align_block_size: Optional[int],
+    align_block_size: int | None,
 ):
     if not moe_permute_unpermute_supported():
         pytest.skip("moe_permute_unpermute is not supported on this platform.")
diff --git a/tests/kernels/moe/test_ocp_mx_moe.py b/tests/kernels/moe/test_ocp_mx_moe.py
index dceed34f35125..7a5d10a87b741 100644
--- a/tests/kernels/moe/test_ocp_mx_moe.py
+++ b/tests/kernels/moe/test_ocp_mx_moe.py
@@ -4,7 +4,6 @@
 import importlib.metadata
 from dataclasses import dataclass
 from importlib.util import find_spec
-from typing import Optional
 
 import pytest
 import torch
@@ -103,7 +102,7 @@ def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase):
         assert output
 
 
-def swiglu(x, alpha: float = 1.702, beta: float = 1.0, limit: Optional[float] = None):
+def swiglu(x, alpha: float = 1.702, beta: float = 1.0, limit: float | None = None):
     # Note we add an extra bias of 1 to the linear layer
     x_glu, x_linear = torch.chunk(x, 2, dim=-1)
     if limit is not None:
@@ -510,7 +509,7 @@ def test_trtllm_gen_mxfp4_fused_moe(
     hidden_size: int,
     alpha: float,
     beta: float,
-    limit: Optional[float],
+    limit: float | None,
     act_type: str,
     transpose_optimized: bool,
 ):
@@ -660,7 +659,7 @@ def test_flashinfer_cutlass_mxfp4_fused_moe(
     hidden_size: int,
     alpha: float,
     beta: float,
-    limit: Optional[float],
+    limit: float | None,
 ):
     torch.manual_seed(42)
     device = "cuda:0"
@@ -811,9 +810,9 @@ def test_flashinfer_cutlass_mxfp4_mxfp8_fused_moe(
     num_tokens: int,
     intermediate_size: int,
     hidden_size: int,
-    alpha: Optional[float],
-    beta: Optional[float],
-    limit: Optional[float],
+    alpha: float | None,
+    beta: float | None,
+    limit: float | None,
 ):
     torch.manual_seed(42)
     device = "cuda:0"
diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py
index 4c7c6c6a4f529..ac7f3fc5e6f05 100644
--- a/tests/kernels/moe/test_pplx_cutlass_moe.py
+++ b/tests/kernels/moe/test_pplx_cutlass_moe.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import pytest
 import torch
@@ -73,7 +72,7 @@ def pplx_cutlass_moe(
     out_dtype,
     per_act_token: bool,
     per_out_ch: bool,
-    group_name: Optional[str],
+    group_name: str | None,
 ):
     from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
         PplxPrepareAndFinalize,
diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py
index 223f095c0b553..e665c636fa265 100644
--- a/tests/kernels/moe/test_pplx_moe.py
+++ b/tests/kernels/moe/test_pplx_moe.py
@@ -9,7 +9,7 @@ import copy
 import itertools
 import textwrap
 import traceback
-from typing import Callable, Optional, Union
+from collections.abc import Callable
 
 import pytest
 import torch
@@ -89,7 +89,7 @@ def torch_prepare(
     a: torch.Tensor,
     topk_ids: torch.Tensor,
     num_experts: int,
-    max_num_tokens: Optional[int] = None,
+    max_num_tokens: int | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     assert topk_ids.dim() == 2
     assert topk_ids.shape[0] == a.shape[0]
@@ -214,10 +214,10 @@ def create_pplx_prepare_finalize(
     dp_size: int,
     world_size: int,
     in_dtype: torch.dtype,
-    quant_dtype: Optional[torch.dtype],
-    block_shape: Optional[list[int]],
+    quant_dtype: torch.dtype | None,
+    block_shape: list[int] | None,
     per_act_token_quant: bool,
-    group_name: Optional[str],
+    group_name: str | None,
 ):
     from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
         PplxPrepareAndFinalize,
@@ -274,18 +274,14 @@ def chunk_by_rank(t: torch.Tensor, r: int, w: int) -> torch.Tensor:
     return t[(r * chunk) : (r + 1) * chunk]
 
 
-def maybe_chunk_by_rank(
-    t: Optional[torch.Tensor], r: int, w: int
-) -> Optional[torch.Tensor]:
+def maybe_chunk_by_rank(t: torch.Tensor | None, r: int, w: int) -> torch.Tensor | None:
     if t is not None:
         return chunk_by_rank(t, r, w)
     else:
         return t
 
 
-def chunk_scales_by_rank(
-    t: Optional[torch.Tensor], r: int, w: int
-) -> Optional[torch.Tensor]:
+def chunk_scales_by_rank(t: torch.Tensor | None, r: int, w: int) -> torch.Tensor | None:
     if t is not None and t.numel() > 1:
         chunk = rank_chunk(t.shape[0], r, w)
         return t[(r * chunk) : (r + 1) * chunk]
@@ -293,9 +289,7 @@ def chunk_scales_by_rank(
         return t
 
 
-def chunk_scales(
-    t: Optional[torch.Tensor], start: int, end: int
-) -> Optional[torch.Tensor]:
+def chunk_scales(t: torch.Tensor | None, start: int, end: int) -> torch.Tensor | None:
     if t is not None and t.numel() > 1:
         return t[start:end]
     else:
@@ -313,10 +307,10 @@ def pplx_prepare_finalize(
     topk_weight: torch.Tensor,
     topk_ids: torch.Tensor,
     num_experts: int,
-    quant_dtype: Optional[torch.dtype],
-    block_shape: Optional[list[int]],
+    quant_dtype: torch.dtype | None,
+    block_shape: list[int] | None,
     per_act_token_quant: bool,
-    group_name: Optional[str],
+    group_name: str | None,
 ) -> torch.Tensor:
     assert torch.cuda.current_device() == pgi.local_rank
 
@@ -409,8 +403,8 @@ def _pplx_prepare_finalize(
     score: torch.Tensor,
     topk: torch.Tensor,
     num_experts: int,
-    quant_dtype: Optional[torch.dtype],
-    block_shape: Optional[list[int]],
+    quant_dtype: torch.dtype | None,
+    block_shape: list[int] | None,
     per_act_token_quant: bool,
     use_internode: bool,
 ):
@@ -479,7 +473,7 @@ def test_pplx_prepare_finalize_slow(
     dtype: torch.dtype,
     world_dp_size: tuple[int, int],
     per_act_token_quant: bool,
-    block_shape: Optional[list[int]],
+    block_shape: list[int] | None,
     use_internode: bool,
 ):
     if dtype == torch.float8_e4m3fn:
@@ -521,7 +515,7 @@ def test_pplx_prepare_finalize_slow(
 
 
 def pplx_moe(
-    group_name: Optional[str],
+    group_name: str | None,
     rank: int,
     world_size: int,
     dp_size: int,
@@ -530,17 +524,17 @@ def pplx_moe(
     w2: torch.Tensor,
     topk_weight: torch.Tensor,
     topk_ids: torch.Tensor,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    quant_dtype: Optional[torch.dtype] = None,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    quant_dtype: torch.dtype | None = None,
     per_act_token_quant=False,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
     use_compile: bool = False,
     use_cudagraphs: bool = True,
-    shared_experts: Optional[torch.nn.Module] = None,
-) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+    shared_experts: torch.nn.Module | None = None,
+) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
     num_tokens, hidden_dim = a.shape
     num_experts = w1.shape[0]
     topk = topk_ids.shape[1]
@@ -657,13 +651,13 @@ def _pplx_moe(
     score: torch.Tensor,
     topk: int,
     num_experts: int,
-    w1_s: Optional[torch.Tensor] = None,
-    w2_s: Optional[torch.Tensor] = None,
-    quant_dtype: Optional[torch.dtype] = None,
+    w1_s: torch.Tensor | None = None,
+    w2_s: torch.Tensor | None = None,
+    quant_dtype: torch.dtype | None = None,
     per_act_token_quant: bool = False,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
     use_internode: bool = False,
-    shared_experts: Optional[torch.nn.Module] = None,
+    shared_experts: torch.nn.Module | None = None,
 ):
     try:
         if use_internode:
@@ -812,7 +806,7 @@ def test_pplx_moe_slow(
     dtype: torch.dtype,
     world_dp_size: tuple[int, int],
     per_act_token_quant: bool,
-    block_shape: Optional[list[int]],
+    block_shape: list[int] | None,
     use_internode: bool,
 ):
     current_platform.seed_everything(7)
diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py
index 9466dacb0c111..65ce4073ad5bc 100644
--- a/tests/kernels/moe/utils.py
+++ b/tests/kernels/moe/utils.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional, Union
 
 import torch
 
@@ -27,13 +26,13 @@ def triton_moe(
     w2: torch.Tensor,
     topk_weight: torch.Tensor,
     topk_ids: torch.Tensor,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    quant_dtype: Optional[torch.dtype] = None,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    quant_dtype: torch.dtype | None = None,
     per_act_token_quant=False,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
 ) -> torch.Tensor:
     quant_config = FusedMoEQuantConfig.make(
         quant_dtype,
@@ -54,13 +53,13 @@ def batched_moe(
     w2: torch.Tensor,
     topk_weight: torch.Tensor,
     topk_ids: torch.Tensor,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    quant_dtype: Optional[torch.dtype] = None,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    quant_dtype: torch.dtype | None = None,
     per_act_token_quant: bool = False,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
 ) -> torch.Tensor:
     max_num_tokens = round_up(a.shape[0], 64)
 
@@ -94,13 +93,13 @@ def naive_batched_moe(
     w2: torch.Tensor,
     topk_weight: torch.Tensor,
     topk_ids: torch.Tensor,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    quant_dtype: Optional[torch.dtype] = None,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    quant_dtype: torch.dtype | None = None,
     per_act_token_quant: bool = False,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
 ) -> torch.Tensor:
     max_num_tokens = round_up(a.shape[0], 64)
 
@@ -129,8 +128,8 @@ def naive_batched_moe(
 
 
 def chunk_scales(
-    scales: Optional[torch.Tensor], start: int, end: int
-) -> Optional[torch.Tensor]:
+    scales: torch.Tensor | None, start: int, end: int
+) -> torch.Tensor | None:
     if scales is not None:
         if scales.numel() == 1:
             return scales
@@ -144,10 +143,10 @@ def make_quantized_test_activations(
     m: int,
     k: int,
     in_dtype: torch.dtype,
-    quant_dtype: Optional[torch.dtype] = None,
-    block_shape: Optional[list[int]] = None,
+    quant_dtype: torch.dtype | None = None,
+    block_shape: list[int] | None = None,
     per_act_token_quant: bool = False,
-) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
     a = torch.randn((E, m, k), device="cuda", dtype=in_dtype) / 10
     a_q = a
     a_scale = None
@@ -172,11 +171,11 @@ def make_quantized_test_activations(
 
 def moe_quantize_weights(
     w: torch.Tensor,
-    w_s: Optional[torch.Tensor],
-    quant_dtype: Union[torch.dtype, str, None],
+    w_s: torch.Tensor | None,
+    quant_dtype: torch.dtype | str | None,
     per_token_quant: bool,
-    block_shape: Optional[list[int]],
-) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+    block_shape: list[int] | None,
+) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
     assert (
         quant_dtype == torch.float8_e4m3fn
         or quant_dtype == torch.int8
@@ -220,10 +219,10 @@ def make_test_weight(
     rows: int,
     cols: int,
     in_dtype: torch.dtype = torch.bfloat16,
-    quant_dtype: Union[torch.dtype, str, None] = None,
-    block_shape: Optional[list[int]] = None,
+    quant_dtype: torch.dtype | str | None = None,
+    block_shape: list[int] | None = None,
     per_out_ch_quant: bool = False,
-) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
     w_16 = torch.randn((e, rows, cols), device="cuda", dtype=in_dtype) / 15
     w_gs = None
 
@@ -262,12 +261,12 @@ def make_test_weights(
     n: int,
     k: int,
     in_dtype: torch.dtype = torch.bfloat16,
-    quant_dtype: Union[torch.dtype, str, None] = None,
-    block_shape: Optional[list[int]] = None,
+    quant_dtype: torch.dtype | str | None = None,
+    block_shape: list[int] | None = None,
     per_out_ch_quant: bool = False,
 ) -> tuple[
-    tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]],
-    tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]],
+    tuple[torch.Tensor, torch.Tensor, torch.Tensor | None, torch.Tensor | None],
+    tuple[torch.Tensor, torch.Tensor, torch.Tensor | None, torch.Tensor | None],
 ]:
     return (
         make_test_weight(
@@ -295,9 +294,9 @@ def make_test_quant_config(
     n: int,
     k: int,
     in_dtype: torch.dtype,
-    quant_dtype: Union[torch.dtype, str, None] = None,
+    quant_dtype: torch.dtype | str | None = None,
     per_act_token_quant: bool = False,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor, FusedMoEQuantConfig]:
     (_, w1, w1_s, w1_gs), (_, w2, w2_s, w2_gs) = make_test_weights(
         e,
@@ -310,8 +309,8 @@ def make_test_quant_config(
     )
 
     # Hacky/trivial scales for nvfp4.
-    a1_gscale: Optional[torch.Tensor] = None
-    a2_gscale: Optional[torch.Tensor] = None
+    a1_gscale: torch.Tensor | None = None
+    a2_gscale: torch.Tensor | None = None
     if quant_dtype == "nvfp4":
         a1_gscale = torch.ones((e,), device="cuda", dtype=torch.float32)
         a2_gscale = torch.ones((e,), device="cuda", dtype=torch.float32)
@@ -348,9 +347,9 @@ def fused_moe(
     score: torch.Tensor,
     topk: int,
     renormalize: bool = False,
-    quant_config: Optional[FusedMoEQuantConfig] = None,
+    quant_config: FusedMoEQuantConfig | None = None,
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
 ) -> torch.Tensor:
     topk_weights, topk_ids, _ = fused_topk(
         hidden_states, score.float(), topk, renormalize
@@ -378,7 +377,7 @@ class BaselineMM(torch.nn.Module):
         self.b = b.to(dtype=torch.float32)
         self.out_dtype = out_dtype
 
-    def forward(self, a: torch.Tensor) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    def forward(self, a: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor | None]:
         return torch.mm(a.to(dtype=torch.float32), self.b).to(self.out_dtype), None
 
 
@@ -422,8 +421,8 @@ class RealMLP(torch.nn.Module):
         quant_config=None,
         reduce_results: bool = True,
         prefix: str = "",
-        w1_s: Optional[torch.Tensor] = None,
-        w2_s: Optional[torch.Tensor] = None,
+        w1_s: torch.Tensor | None = None,
+        w2_s: torch.Tensor | None = None,
     ) -> None:
         from vllm.model_executor.layers.linear import (
             MergedColumnParallelLinear,
@@ -481,7 +480,7 @@ def make_shared_experts(
     N: int,
     K: int,
     in_dtype: torch.dtype = torch.bfloat16,
-    quant_dtype: Union[torch.dtype, str, None] = None,
+    quant_dtype: torch.dtype | str | None = None,
 ) -> torch.nn.Module:
     from vllm.model_executor.layers.quantization.fp8 import Fp8Config
 
diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py
index d892f2a5acc09..9d11a7ef64138 100644
--- a/tests/kernels/quant_utils.py
+++ b/tests/kernels/quant_utils.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
 
 import torch
 
@@ -15,13 +14,13 @@ ROCM_FP8FNUZ_MAX = 224.0
 FP8_DTYPE = current_platform.fp8_dtype()
 
 
-def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
+def as_float32_tensor(x: float | torch.Tensor) -> torch.Tensor:
     return torch.as_tensor(x, dtype=torch.float32, device="cuda")
 
 
 def ref_dynamic_per_token_quant(
-    x: torch.tensor, quant_dtype: torch.dtype, scale_ub: Optional[torch.tensor] = None
-) -> tuple[torch.tensor, torch.tensor]:
+    x: torch.Tensor, quant_dtype: torch.dtype, scale_ub: torch.Tensor | None = None
+) -> tuple[torch.Tensor, torch.Tensor]:
     assert quant_dtype in [torch.int8, FP8_DTYPE]
     if scale_ub is not None:
         assert quant_dtype == FP8_DTYPE
@@ -76,8 +75,8 @@ def ref_dynamic_per_token_quant(
 # ref_dynamic_per_token_quant, when we have a dynamic_per_tensor int8 quant
 # kernel
 def ref_dynamic_per_tensor_fp8_quant(
-    x: torch.tensor,
-) -> tuple[torch.tensor, torch.tensor]:
+    x: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
     fp8_traits = torch.finfo(FP8_DTYPE)
     fp8_traits_max = (
         ROCM_FP8FNUZ_MAX
@@ -250,10 +249,10 @@ def per_block_cast_to_int8(
 
 def dequant(
     t: torch.Tensor,
-    scale: Optional[torch.Tensor],
-    block_shape: Optional[list[int]],
+    scale: torch.Tensor | None,
+    block_shape: list[int] | None,
     per_act_token_quant: bool,
-    out_dtype: Optional[torch.dtype] = torch.float32,
+    out_dtype: torch.dtype | None = torch.float32,
 ) -> torch.Tensor:
     if scale is not None:
         f32 = torch.float32
@@ -267,10 +266,10 @@ def dequant(
 
 def batched_dequant(
     t: torch.Tensor,
-    scale: Optional[torch.Tensor],
-    block_shape: Optional[list[int]],
+    scale: torch.Tensor | None,
+    block_shape: list[int] | None,
     per_act_token_quant: bool,
-    out_dtype: Optional[torch.dtype] = torch.float32,
+    out_dtype: torch.dtype | None = torch.float32,
 ) -> torch.Tensor:
     if scale is not None:
         assert t.shape[0] == scale.shape[0]
@@ -289,9 +288,9 @@ def native_batched_masked_quant_matmul(
     B: torch.Tensor,
     C: torch.Tensor,
     num_expert_tokens: torch.Tensor,
-    A_scale: Optional[torch.Tensor] = None,
-    B_scale: Optional[torch.Tensor] = None,
-    block_shape: Optional[list[int]] = None,
+    A_scale: torch.Tensor | None = None,
+    B_scale: torch.Tensor | None = None,
+    block_shape: list[int] | None = None,
     per_act_token_quant: bool = False,
 ) -> torch.Tensor:
     num_expert_tokens_cpu = num_expert_tokens.clone()
diff --git a/tests/kernels/quantization/test_cutlass_w4a8.py b/tests/kernels/quantization/test_cutlass_w4a8.py
index a3d524fe90ed0..465e24fd7eb97 100644
--- a/tests/kernels/quantization/test_cutlass_w4a8.py
+++ b/tests/kernels/quantization/test_cutlass_w4a8.py
@@ -6,7 +6,6 @@ Run `pytest tests/kernels/quantization/test_cutlass_w4a8.py`.
 """
 
 from dataclasses import dataclass
-from typing import Optional
 
 import pytest
 import torch
@@ -60,10 +59,10 @@ SCHEDULES = [
 class TypeConfig:
     act_type: torch.dtype
     weight_type: ScalarType
-    output_type: Optional[torch.dtype]
-    group_scale_type: Optional[torch.dtype]
-    channel_scale_type: Optional[torch.dtype]
-    token_scale_type: Optional[torch.dtype]
+    output_type: torch.dtype | None
+    group_scale_type: torch.dtype | None
+    channel_scale_type: torch.dtype | None
+    token_scale_type: torch.dtype | None
 
 
 @dataclass
@@ -80,7 +79,7 @@ class Tensors:
 # (Act Type, Weight Type, Output Type, Scale Type, ZeroPoints,
 #  Ch Scales Type, Tok Scales Type)
 TestTypeTuple = tuple[
-    list[torch.dtype], ScalarType, Optional[torch.dtype], Optional[torch.dtype], bool
+    list[torch.dtype], ScalarType, torch.dtype | None, torch.dtype | None, bool
 ]
 TEST_TYPES = [
     *(
@@ -116,8 +115,8 @@ def cutlass_quantize_and_pack(
     atype: torch.dtype,
     w: torch.Tensor,
     wtype: ScalarType,
-    stype: Optional[torch.dtype],
-    group_size: Optional[int],
+    stype: torch.dtype | None,
+    group_size: int | None,
     zero_points: bool = False,
 ):
     assert wtype.is_integer(), "TODO: support floating point weights"
@@ -143,7 +142,7 @@ def cutlass_quantize_and_pack(
 
 
 def create_test_tensors(
-    shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
+    shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
 ) -> Tensors:
     m, n, k = shape
 
@@ -185,8 +184,8 @@ def create_test_tensors(
 def mm_test_helper(
     types: TypeConfig,
     tensors: Tensors,
-    group_size: Optional[int] = None,
-    schedule: Optional[str] = None,
+    group_size: int | None = None,
+    schedule: str | None = None,
 ):
     # CUTLASS upstream uses fp8 with fastaccum as reference
     # https://github.com/NVIDIA/cutlass/blob/main/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu#L406
diff --git a/tests/kernels/quantization/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py
index b32523bb85d9a..efa81de158d38 100644
--- a/tests/kernels/quantization/test_machete_mm.py
+++ b/tests/kernels/quantization/test_machete_mm.py
@@ -7,7 +7,6 @@ Run `pytest tests/kernels/quantization/test_machete_mm.py`.
 
 import math
 from dataclasses import dataclass, fields
-from typing import Optional
 
 import pytest
 import torch
@@ -50,11 +49,11 @@ MNK_SHAPES = [
 class TypeConfig:
     act_type: torch.dtype
     weight_type: ScalarType
-    output_type: Optional[torch.dtype]
-    group_scale_type: Optional[torch.dtype]
-    group_zero_type: Optional[torch.dtype]
-    channel_scale_type: Optional[torch.dtype]
-    token_scale_type: Optional[torch.dtype]
+    output_type: torch.dtype | None
+    group_scale_type: torch.dtype | None
+    group_zero_type: torch.dtype | None
+    channel_scale_type: torch.dtype | None
+    token_scale_type: torch.dtype | None
 
 
 @dataclass
@@ -63,10 +62,10 @@ class Tensors:
     a_ref: torch.Tensor
     a: torch.Tensor
     w_q: torch.Tensor
-    w_g_s: Optional[torch.Tensor]
-    w_g_zp: Optional[torch.Tensor]
-    w_ch_s: Optional[torch.Tensor]
-    w_tok_s: Optional[torch.Tensor]
+    w_g_s: torch.Tensor | None
+    w_g_zp: torch.Tensor | None
+    w_ch_s: torch.Tensor | None
+    w_tok_s: torch.Tensor | None
 
 
 # (Act Type, Weight Type, Output Type, Scale Type, ZeroPoints,
@@ -74,7 +73,7 @@ class Tensors:
 # NOTE: None "Scale Type" means the act type is floating point
 #       None "Output Type" means the output type is the same as the act type
 TestTypeTuple = tuple[
-    list[torch.dtype], ScalarType, Optional[torch.dtype], Optional[torch.dtype], bool
+    list[torch.dtype], ScalarType, torch.dtype | None, torch.dtype | None, bool
 ]
 TEST_TYPES = [
     # GPTQ style
@@ -139,11 +138,11 @@ def rand_data(shape, dtype=torch.float16, scale=1, offset=0):
         return torch.randint(-8, 7, shape, dtype=dtype, device="cuda")
 
 
-def maybe_convert_zeropoints(zps: Optional[torch.Tensor], s: torch.Tensor):
+def maybe_convert_zeropoints(zps: torch.Tensor | None, s: torch.Tensor):
     return zps if zps is None else -1 * s * (zps.to(s.dtype))
 
 
-def group_size_valid(shape: tuple[int, int, int], group_size: Optional[int]) -> bool:
+def group_size_valid(shape: tuple[int, int, int], group_size: int | None) -> bool:
     return group_size is None or group_size == -1 or shape[2] % group_size == 0
 
 
@@ -151,8 +150,8 @@ def machete_quantize_and_pack(
     atype: torch.dtype,
     w: torch.Tensor,
     wtype: ScalarType,
-    stype: Optional[torch.dtype],
-    group_size: Optional[int],
+    stype: torch.dtype | None,
+    group_size: int | None,
     zero_points: bool = False,
 ):
     assert wtype.is_integer(), "TODO: support floating point weights"
@@ -178,8 +177,8 @@ def machete_quantize_and_pack(
 def create_test_tensors(
     shape: tuple[int, int, int],
     types: TypeConfig,
-    group_size: Optional[int],
-    subset_stride_factor: Optional[int] = None,
+    group_size: int | None,
+    subset_stride_factor: int | None = None,
 ) -> Tensors:
     m, n, k = shape
     factor = subset_stride_factor or 1
@@ -243,8 +242,8 @@ def create_test_tensors(
 def machete_mm_test_helper(
     types: TypeConfig,
     tensors: Tensors,
-    group_size: Optional[int] = None,
-    schedule: Optional[str] = None,
+    group_size: int | None = None,
+    schedule: str | None = None,
 ):
     output_ref = torch.matmul(tensors.a_ref, tensors.w_ref)
     output_ref_type = output_ref.dtype
@@ -294,7 +293,7 @@ def machete_mm_test_helper(
 @pytest.mark.parametrize("shape", MNK_SHAPES, ids=lambda x: "x".join(str(v) for v in x))
 @pytest.mark.parametrize("types", TEST_TYPES)
 def test_machete_all_schedules(shape, types: TypeConfig):
-    group_sizes: list[Optional[int]] = []
+    group_sizes: list[int | None] = []
     if types.group_scale_type is None:
         group_sizes = [None]
     else:
@@ -323,7 +322,7 @@ def test_machete_all_schedules(shape, types: TypeConfig):
 @pytest.mark.parametrize("shape", MNK_SHAPES, ids=lambda x: "x".join(str(v) for v in x))
 @pytest.mark.parametrize("types", TEST_TYPES)
 def test_machete_heuristic(shape, types: TypeConfig):
-    group_sizes: list[Optional[int]] = []
+    group_sizes: list[int | None] = []
     if types.group_scale_type is None:
         group_sizes = [None]
     else:
diff --git a/tests/kernels/quantization/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py
index 1026332d99f89..6633a8bbd3c60 100644
--- a/tests/kernels/quantization/test_triton_scaled_mm.py
+++ b/tests/kernels/quantization/test_triton_scaled_mm.py
@@ -6,7 +6,6 @@ Run `pytest tests/kernels/quantization/test_triton_scaled_mm.py`.
 """
 
 import importlib
-from typing import Optional
 
 import pytest
 import torch
@@ -27,7 +26,7 @@ def torch_scaled_mm(
     scale_a: torch.Tensor,
     scale_b: torch.Tensor,
     out_dtype: type[torch.dtype],
-    bias: Optional[torch.Tensor] = None,
+    bias: torch.Tensor | None = None,
 ) -> torch.Tensor:
     out = torch.mm(a.to(torch.float32), b.to(torch.float32))
     out = scale_a * out
diff --git a/tests/kernels/test_onednn.py b/tests/kernels/test_onednn.py
index 9f78c177a81f0..c9eca1f86d3a1 100644
--- a/tests/kernels/test_onednn.py
+++ b/tests/kernels/test_onednn.py
@@ -2,8 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Integration tests for FlexAttention backend vs default backend"""
 
-from typing import Optional
-
 import pytest
 import torch
 
@@ -38,8 +36,8 @@ def ref_int8_scaled_mm(
     b: torch.Tensor,
     scale_a: torch.Tensor,
     scale_b: torch.Tensor,
-    azp: Optional[torch.Tensor],
-    bias: Optional[torch.Tensor],
+    azp: torch.Tensor | None,
+    bias: torch.Tensor | None,
     output_type: torch.dtype,
 ):
     if azp is not None:
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index 015424d9ee0f7..6c7ff984b4337 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -7,7 +7,7 @@ import random
 import unittest
 from collections.abc import Sequence
 from numbers import Number
-from typing import Any, NamedTuple, Optional, Union
+from typing import Any, NamedTuple
 
 import pytest
 import torch
@@ -96,10 +96,10 @@ class PackedQKVInputs(NamedTuple):
     query: torch.Tensor
     key: torch.Tensor
     value: torch.Tensor
-    q_start_loc_list: Optional[list[int]]
-    kv_start_loc_list: Optional[list[int]]
-    q_seq_lens: Optional[list[int]]
-    kv_seq_lens: Optional[list[int]]
+    q_start_loc_list: list[int] | None
+    kv_start_loc_list: list[int] | None
+    q_seq_lens: list[int] | None
+    kv_seq_lens: list[int] | None
 
 
 class PackedQKVO(NamedTuple):
@@ -115,7 +115,7 @@ class PackedQKVO(NamedTuple):
                         x head_size) known-correct attention output
     """
 
-    packed_qkv: Optional[PackedQKVInputs]
+    packed_qkv: PackedQKVInputs | None
     ideal_output: torch.Tensor
 
 
@@ -149,12 +149,12 @@ class PhaseTestParameters(NamedTuple):
     """
 
     packed_qkvo: PackedQKVO
-    kv_mmap: Optional[KVMemoryMap]
+    kv_mmap: KVMemoryMap | None
 
 
 def maybe_make_int_tensor(
-    _list: Optional[list[int]],
-    device: Union[torch.device, str],
+    _list: list[int] | None,
+    device: torch.device | str,
 ) -> torch.Tensor:
     """
     Convert Python int list to a 1D int torch.Tensor on `device`
@@ -170,8 +170,8 @@ def maybe_make_int_tensor(
 
 
 def maybe_make_long_tensor(
-    _list: Optional[list[int]],
-    device: Union[torch.device, str],
+    _list: list[int] | None,
+    device: torch.device | str,
 ) -> torch.Tensor:
     """
     Convert Python int list to a 1D long torch.Tensor on `device`
@@ -186,7 +186,7 @@ def maybe_make_long_tensor(
     )
 
 
-def maybe_max(_list: Optional[list]) -> Optional[Number]:
+def maybe_max(_list: list | None) -> Number | None:
     """
     Returns:
 
@@ -241,9 +241,9 @@ def ref_masked_attention(
     key: torch.Tensor,
     value: torch.Tensor,
     scale: float,
-    custom_mask: Optional[torch.Tensor] = None,
-    q_seq_lens: Optional[list] = None,
-    kv_seq_lens: Optional[list] = None,
+    custom_mask: torch.Tensor | None = None,
+    q_seq_lens: list | None = None,
+    kv_seq_lens: list | None = None,
 ) -> torch.Tensor:
     """
     "Golden" masked attention reference. Supports two types of masking:
@@ -302,11 +302,11 @@ def ref_masked_attention(
 def make_qkv(
     batch_size: int,
     max_q_seq_len: int,
-    max_kv_seq_len: Optional[int],
+    max_kv_seq_len: int | None,
     num_heads: int,
     head_size: int,
-    device: Union[torch.device, str],
-    force_kv_seq_lens: Optional[list[int]] = None,
+    device: torch.device | str,
+    force_kv_seq_lens: list[int] | None = None,
     attn_type: AttentionType = AttentionType.ENCODER_DECODER,
     force_max_len: bool = False,
 ) -> tuple[QKVInputs, QKVInputs, QKVInputs]:
@@ -436,7 +436,7 @@ def make_qkv(
 
 
 def pack_tensor(
-    unpacked_tensor: torch.Tensor, seq_lens: list[int], device: Union[torch.device, str]
+    unpacked_tensor: torch.Tensor, seq_lens: list[int], device: torch.device | str
 ) -> tuple[torch.Tensor, list[int]]:
     """
     Pack a batch_size x padded_seq_len x num_heads x head_size tensor into an
@@ -470,7 +470,7 @@ def pack_tensor(
     return packed_tensor, start_loc_list
 
 
-def pack_qkv(qkv: QKVInputs, device: Union[torch.device, str]) -> PackedQKVInputs:
+def pack_qkv(qkv: QKVInputs, device: torch.device | str) -> PackedQKVInputs:
     """
     Individually pack each of Q, K and V, each with dimensions batch_size x
     padded_seq_len x num_heads x head_size, into respective number_of_tokens x
@@ -594,19 +594,19 @@ def make_alibi_bias(
 
 
 def _make_metadata_tensors(
-    seq_lens: Optional[list[int]],
-    context_lens: Optional[list[int]],
-    encoder_seq_lens: Optional[list[int]],
-    device: Union[torch.device, str],
+    seq_lens: list[int] | None,
+    context_lens: list[int] | None,
+    encoder_seq_lens: list[int] | None,
+    device: torch.device | str,
 ) -> tuple[
     torch.Tensor,
     torch.Tensor,
     Any,
     Any,
-    Optional[torch.Tensor],
+    torch.Tensor | None,
     torch.Tensor,
     torch.Tensor,
-    Optional[int],
+    int | None,
 ]:
     """
     Build scalar & tensor values required to build attention metadata structure.
@@ -678,7 +678,7 @@ def make_kv_cache(
     num_heads: int,
     head_size: int,
     block_size: int,
-    device: Union[torch.device, str],
+    device: torch.device | str,
     backend: str,
     default_val: float = 0.0,
 ) -> torch.Tensor:
@@ -726,18 +726,18 @@ def _num_tokens_to_min_blocks(num_tokens: int, block_size: int) -> int:
     return (num_tokens + block_size) // block_size
 
 
-def make_empty_slot_mapping_tensor(device: Union[torch.device, str]):
+def make_empty_slot_mapping_tensor(device: torch.device | str):
     return maybe_make_long_tensor([], device)
 
 
-def make_empty_block_tables_tensor(device: Union[torch.device, str]):
+def make_empty_block_tables_tensor(device: torch.device | str):
     return torch.tensor([], device=device)
 
 
 def split_slot_mapping(
     slot_mapping_list: torch.Tensor,
     seq_lens: list[int],
-    device: Union[torch.device, str],
+    device: torch.device | str,
 ):
     """
     Split a slot mapping into valid prefill- and decode-phase slot mappings.
@@ -799,7 +799,7 @@ def split_slot_mapping(
 def make_block_tables_slot_mapping(
     block_size: int,
     seq_lens: list[int],
-    device: Union[torch.device, str],
+    device: torch.device | str,
     block_base_addr: int = 0,
 ) -> tuple[torch.Tensor, list[int], int]:
     """
@@ -880,11 +880,11 @@ def make_block_tables_slot_mapping(
 def make_test_metadata(
     attn_backend: _Backend,
     is_prompt: bool,
-    seq_lens: Optional[list[int]],
-    decoder_test_params: Optional[PhaseTestParameters],
-    device: Union[torch.device, str],
-    encoder_test_params: Optional[PhaseTestParameters] = None,
-    cross_test_params: Optional[PhaseTestParameters] = None,
+    seq_lens: list[int] | None,
+    decoder_test_params: PhaseTestParameters | None,
+    device: torch.device | str,
+    encoder_test_params: PhaseTestParameters | None = None,
+    cross_test_params: PhaseTestParameters | None = None,
 ) -> AttentionMetadata:
     """
     Construct fake attention metadata for a given test phase
@@ -1142,16 +1142,16 @@ def torch_experts(
     topk_weight: torch.Tensor,
     topk_ids: torch.Tensor,
     global_num_experts: int = -1,
-    b_bias1: Optional[torch.Tensor] = None,
-    b_bias2: Optional[torch.Tensor] = None,
-    expert_map: Optional[torch.Tensor] = None,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    quant_dtype: Optional[torch.dtype] = None,
+    b_bias1: torch.Tensor | None = None,
+    b_bias2: torch.Tensor | None = None,
+    expert_map: torch.Tensor | None = None,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    quant_dtype: torch.dtype | None = None,
     per_act_token_quant=False,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
     apply_router_weights_on_input: bool = False,
 ) -> torch.Tensor:
     assert (
@@ -1261,10 +1261,10 @@ def torch_moe(
     w2: torch.Tensor,
     score: torch.Tensor,
     topk: int,
-    b_bias1: Optional[torch.Tensor] = None,
-    b_bias2: Optional[torch.Tensor] = None,
+    b_bias1: torch.Tensor | None = None,
+    b_bias2: torch.Tensor | None = None,
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
 ) -> torch.Tensor:
     score = torch.softmax(score, dim=-1, dtype=torch.float32)
     topk_weight, topk_ids = torch.topk(score, topk)
@@ -1298,15 +1298,13 @@ def torch_moe_single(a, w, score, topk):
 # A special version of op check that has a restricted default set of test_utils
 # and a patched version of allclose that supports fp8 types.
 def opcheck(
-    op: Union[
-        torch._ops.OpOverload,
-        torch._ops.OpOverloadPacket,
-        torch._library.custom_ops.CustomOpDef,
-    ],
+    op: torch._ops.OpOverload
+    | torch._ops.OpOverloadPacket
+    | torch._library.custom_ops.CustomOpDef,
     args: tuple[Any, ...],
-    kwargs: Optional[dict[str, Any]] = None,
+    kwargs: dict[str, Any] | None = None,
     *,
-    test_utils: Union[str, Sequence[str]] = ALL_OPCHECK_TEST_UTILS,
+    test_utils: str | Sequence[str] = ALL_OPCHECK_TEST_UTILS,
     raise_exception: bool = True,
     cond: bool = True,
 ) -> dict[str, str]:
@@ -1338,7 +1336,7 @@ def baseline_scaled_mm(
     scale_a: torch.Tensor,
     scale_b: torch.Tensor,
     out_dtype: type[torch.dtype],
-    bias: Optional[torch.Tensor] = None,
+    bias: torch.Tensor | None = None,
 ) -> torch.Tensor:
     # We treat N-dimensional group scaling as extended numpy-style broadcasting
     # in numpy simply stretches dimensions with an extent of 1 to match
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index 695e06e7c1d63..8f18f01441932 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -4,7 +4,6 @@
 import random
 from copy import deepcopy
 from dataclasses import dataclass
-from typing import Optional
 from unittest.mock import patch
 
 import pytest
@@ -106,7 +105,7 @@ def skip_cuda_with_stage_false(request):
 
 def get_random_id_to_index(
     num_loras: int, num_slots: int, log: bool = True
-) -> list[Optional[int]]:
+) -> list[int | None]:
     """Creates a random lora_id_to_index mapping.
 
     Args:
@@ -122,7 +121,7 @@ def get_random_id_to_index(
             "num_loras must be less than or equal to num_slots."
         )
 
-    slots: list[Optional[int]] = [None] * num_slots
+    slots: list[int | None] = [None] * num_slots
     random_slot_selections = (torch.randperm(num_slots)[:num_loras]).tolist()
     for lora_id, slot_idx in enumerate(random_slot_selections, start=1):
         slots[slot_idx] = lora_id
@@ -134,7 +133,7 @@ def get_random_id_to_index(
 
 
 def populate_loras(
-    id_to_index: list[Optional[int]],
+    id_to_index: list[int | None],
     layer: BaseLayerWithLoRA,
     layer_weights: torch.Tensor,
     generate_embeddings_tensor: int = 0,
diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py
index 0d9431bd7aaea..50fd63d35cded 100644
--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import subprocess
 import sys
-from typing import Union
 
 import vllm
 from vllm import LLM
@@ -27,7 +26,7 @@ def do_sample(
     llm: vllm.LLM,
     lora_path: str,
     lora_id: int,
-    tensorizer_config_dict: Union[dict, None] = None,
+    tensorizer_config_dict: dict | None = None,
 ) -> list[str]:
     prompts = [
         "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
@@ -73,9 +72,7 @@ def do_sample(
     return generated_texts
 
 
-def generate_and_test(
-    llm, sql_lora_files, tensorizer_config_dict: Union[dict, None] = None
-):
+def generate_and_test(llm, sql_lora_files, tensorizer_config_dict: dict | None = None):
     print("lora adapter created")
     print("lora 1")
     assert (
diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py
index 894263bd0ba38..1800ca107a426 100644
--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import Optional
 
 import pytest
 
@@ -20,7 +19,7 @@ class TestConfig:
     max_loras: int = 2
     max_lora_rank: int = 16
     max_model_len: int = 4096
-    mm_processor_kwargs: Optional[dict[str, int]] = None
+    mm_processor_kwargs: dict[str, int] | None = None
 
     def __post_init__(self):
         if self.mm_processor_kwargs is None:
@@ -61,7 +60,7 @@ class Qwen2VLTester:
         self,
         images: list[ImageAsset],
         expected_outputs: list[str],
-        lora_id: Optional[int] = None,
+        lora_id: int | None = None,
         temperature: float = 0,
         max_tokens: int = 5,
     ):
@@ -92,7 +91,7 @@ class Qwen2VLTester:
         self,
         images: list[ImageAsset],
         expected_outputs: list[list[str]],
-        lora_id: Optional[int] = None,
+        lora_id: int | None = None,
         temperature: float = 0,
         beam_width: int = 2,
         max_tokens: int = 5,
diff --git a/tests/lora/test_resolver.py b/tests/lora/test_resolver.py
index c70e58a375c78..9b5dedc4327fb 100644
--- a/tests/lora/test_resolver.py
+++ b/tests/lora/test_resolver.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import pytest
 
@@ -14,7 +13,7 @@ class DummyLoRAResolver(LoRAResolver):
 
     async def resolve_lora(
         self, base_model_name: str, lora_name: str
-    ) -> Optional[LoRARequest]:
+    ) -> LoRARequest | None:
         if lora_name == "test_lora":
             return LoRARequest(
                 lora_name=lora_name,
diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py
index c861a52d68721..eb026c2ec0209 100644
--- a/tests/lora/test_utils.py
+++ b/tests/lora/test_utils.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections import OrderedDict
-from typing import NamedTuple, Optional
+from typing import NamedTuple
 from unittest.mock import patch
 
 import pytest
@@ -21,7 +21,7 @@ class LoRANameParserTestConfig(NamedTuple):
     name: str
     module_name: str
     is_lora_a: bool
-    weights_mapper: Optional[WeightsMapper] = None
+    weights_mapper: WeightsMapper | None = None
 
 
 def test_parse_fine_tuned_lora_name_valid():
diff --git a/tests/lora/utils.py b/tests/lora/utils.py
index b522aa6b08743..d30b77f094665 100644
--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
@@ -4,7 +4,6 @@
 import json
 import os
 from dataclasses import dataclass
-from typing import Optional, Union
 
 import torch
 from safetensors.torch import save_file
@@ -81,7 +80,7 @@ class DummyLoRAManager:
         module_name: str,
         input_dim: int,
         output_dims: list[int],
-        noop_lora_index: Optional[list[int]] = None,
+        noop_lora_index: list[int] | None = None,
         rank: int = 8,
     ):
         base_loras: list[LoRALayerWeights] = []
@@ -113,7 +112,7 @@ def assert_close(a, b):
 @dataclass
 class PunicaTensors:
     inputs_tensor: torch.Tensor
-    lora_weights: Union[torch.Tensor, list[torch.Tensor]]
+    lora_weights: torch.Tensor | list[torch.Tensor]
     our_out_tensor: torch.Tensor
     ref_out_tensor: torch.Tensor
     b_seq_start_loc: torch.Tensor
diff --git a/tests/model_executor/model_loader/tensorizer_loader/conftest.py b/tests/model_executor/model_loader/tensorizer_loader/conftest.py
index add6d3742ff53..74724a3b398dd 100644
--- a/tests/model_executor/model_loader/tensorizer_loader/conftest.py
+++ b/tests/model_executor/model_loader/tensorizer_loader/conftest.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Callable
+from collections.abc import Callable
 
 import pytest
 
diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py
index 12aad4cb8da0f..bf290079469aa 100644
--- a/tests/model_executor/test_enabled_custom_ops.py
+++ b/tests/model_executor/test_enabled_custom_ops.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import pytest
 import torch
@@ -76,7 +75,7 @@ class Relu3(ReLUSquaredActivation):
     ],
 )
 def test_enabled_ops(
-    env: Optional[str],
+    env: str | None,
     torch_level: int,
     use_inductor: bool,
     ops_enabled: list[int],
diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py
index b161cc7153b8f..ad37d1ad82c03 100644
--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import pytest
 import torch
@@ -138,7 +137,7 @@ def test_models(
             example_prompts, max_tokens, num_logprobs
         )
 
-        prompt_embeds: Optional[list[torch.Tensor]] = [] if use_prompt_embeds else None
+        prompt_embeds: list[torch.Tensor] | None = [] if use_prompt_embeds else None
 
         prompt_token_ids = []
         for prompt in example_prompts:
diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py
index abedd15b0d7eb..fd2df329f17f9 100644
--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable
+from collections.abc import Callable
 
 import pytest
 
diff --git a/tests/models/language/generation_ppl_test/ppl_utils.py b/tests/models/language/generation_ppl_test/ppl_utils.py
index 43f6066b1c85e..cfa09635effc1 100644
--- a/tests/models/language/generation_ppl_test/ppl_utils.py
+++ b/tests/models/language/generation_ppl_test/ppl_utils.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from https://huggingface.co/docs/transformers/perplexity
-from typing import Optional, cast
+from typing import cast
 
 import pytest
 import torch
@@ -85,7 +85,7 @@ def wikitext_ppl_test(
         n_tokens = 0
         for output in outputs:
             output = cast(TokensTextLogprobsPromptLogprobs, output)
-            token_datas = cast(list[Optional[dict[int, Logprob]]], output[3])
+            token_datas = cast(list[dict[int, Logprob] | None], output[3])
 
             assert token_datas[0] is None
             token_log_probs = []
diff --git a/tests/models/language/pooling/embed_utils.py b/tests/models/language/pooling/embed_utils.py
index 261ab80ae86bc..4ac40656bc62a 100644
--- a/tests/models/language/pooling/embed_utils.py
+++ b/tests/models/language/pooling/embed_utils.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Sequence
-from typing import Optional
 
 import pytest
 
@@ -13,7 +12,7 @@ def run_embedding_correctness_test(
     hf_model: "HfRunner",
     inputs: list[str],
     vllm_outputs: Sequence[list[float]],
-    dimensions: Optional[int] = None,
+    dimensions: int | None = None,
 ):
     hf_outputs = hf_model.encode(inputs)
     if dimensions:
diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py
index c9574dca498ee..c8deffbf66dba 100644
--- a/tests/models/language/pooling/test_embedding.py
+++ b/tests/models/language/pooling/test_embedding.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import pytest
 
@@ -66,7 +65,7 @@ def test_models(
             pooling_type="MEAN", normalize=False
         )
 
-    max_model_len: Optional[int] = 512
+    max_model_len: int | None = 512
     if model in [
         "sentence-transformers/all-MiniLM-L12-v2",
         "sentence-transformers/stsb-roberta-base-v2",
diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py
index 14308ac06c03e..0adc9b5cf25f6 100644
--- a/tests/models/language/pooling/test_gritlm.py
+++ b/tests/models/language/pooling/test_gritlm.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import numpy as np
 import openai
 import pytest
diff --git a/tests/models/language/pooling_mteb_test/mteb_utils.py b/tests/models/language/pooling_mteb_test/mteb_utils.py
index 65ad49fad3653..f2a8177377491 100644
--- a/tests/models/language/pooling_mteb_test/mteb_utils.py
+++ b/tests/models/language/pooling_mteb_test/mteb_utils.py
@@ -3,7 +3,6 @@
 
 import tempfile
 from collections.abc import Sequence
-from typing import Optional
 
 import mteb
 import numpy as np
@@ -51,7 +50,7 @@ class VllmMtebEncoder(mteb.Encoder):
 
     def predict(
         self,
-        sentences: list[tuple[str, str, Optional[str]]],  # query, corpus, prompt
+        sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
         *args,
         **kwargs,
     ) -> np.ndarray:
@@ -100,7 +99,7 @@ class ScoreClientMtebEncoder(mteb.Encoder):
 
     def predict(
         self,
-        sentences: list[tuple[str, str, Optional[str]]],  # query, corpus, prompt
+        sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
         *args,
         **kwargs,
     ) -> np.ndarray:
@@ -294,7 +293,7 @@ def mteb_test_rerank_models_hf(
         original_predict = hf_model.predict
 
         def _predict(
-            sentences: list[tuple[str, str, Optional[str]]],  # query, corpus, prompt
+            sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
             *args,
             **kwargs,
         ):
diff --git a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
index 9e95dd74c3978..2927a37111364 100644
--- a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
+++ b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any, Optional
+from typing import Any
 
 import numpy as np
 import pytest
@@ -111,7 +111,7 @@ class GemmaMtebEncoder(VllmMtebEncoder):
 
     def predict(
         self,
-        sentences: list[tuple[str, str, Optional[str]]],  # query, corpus, prompt
+        sentences: list[tuple[str, str, str | None]],  # query, corpus, prompt
         *args,
         **kwargs,
     ) -> np.ndarray:
diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py
index ef08b1916aa5f..e39dfc888779e 100644
--- a/tests/models/multimodal/generation/test_granite_speech.py
+++ b/tests/models/multimodal/generation/test_granite_speech.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
-from typing import Optional
 
 import pytest
 from transformers import AutoModelForSpeechSeq2Seq
@@ -18,8 +17,8 @@ HF_AUDIO_PROMPT = "<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date:
 
 
 def vllm_to_hf_output(
-    vllm_output: tuple[list[int], str, Optional[SampleLogprobs]],
-) -> tuple[list[int], str, Optional[SampleLogprobs]]:
+    vllm_output: tuple[list[int], str, SampleLogprobs | None],
+) -> tuple[list[int], str, SampleLogprobs | None]:
     """Sanitize hf output to be comparable with vllm output."""
     output_ids, output_str, out_logprobs = vllm_output
 
@@ -46,7 +45,7 @@ def run_test(
     max_tokens: int,
     num_logprobs: int,
     tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
+    distributed_executor_backend: str | None = None,
 ):
     """Inference result should be the same between hf and vllm.
 
diff --git a/tests/models/multimodal/generation/test_phi4_multimodal.py b/tests/models/multimodal/generation/test_phi4_multimodal.py
index 132c69285c5c7..cbc7dfca0234d 100644
--- a/tests/models/multimodal/generation/test_phi4_multimodal.py
+++ b/tests/models/multimodal/generation/test_phi4_multimodal.py
@@ -3,7 +3,6 @@
 
 import os
 from collections.abc import Sequence
-from typing import Optional
 
 import librosa
 import pytest
@@ -57,7 +56,7 @@ if current_platform.is_rocm():
 def run_test(
     hf_runner: type[HfRunner],
     vllm_runner: type[VllmRunner],
-    inputs: Sequence[tuple[list[str], PromptImageInput, Optional[PromptAudioInput]]],
+    inputs: Sequence[tuple[list[str], PromptImageInput, PromptAudioInput | None]],
     model: str,
     *,
     max_model_len: int,
@@ -66,7 +65,7 @@ def run_test(
     num_logprobs: int,
     mm_limit: int,
     tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
+    distributed_executor_backend: str | None = None,
 ):
     """Inference result should be the same between hf and vllm.
 
diff --git a/tests/models/multimodal/generation/test_phi4mm.py b/tests/models/multimodal/generation/test_phi4mm.py
index e69d44c6a1319..5619cecc081d2 100644
--- a/tests/models/multimodal/generation/test_phi4mm.py
+++ b/tests/models/multimodal/generation/test_phi4mm.py
@@ -3,7 +3,6 @@
 
 import os
 from collections.abc import Sequence
-from typing import Optional
 
 import librosa
 import pytest
@@ -48,7 +47,7 @@ models = [model_path]
 
 
 def vllm_to_hf_output(
-    vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], model: str
+    vllm_output: tuple[list[int], str, SampleLogprobs | None], model: str
 ):
     """Sanitize vllm output to be comparable with hf output."""
     _, output_str, out_logprobs = vllm_output
@@ -79,7 +78,7 @@ if current_platform.is_rocm():
 def run_test(
     hf_runner: type[HfRunner],
     vllm_runner: type[VllmRunner],
-    inputs: Sequence[tuple[list[str], PromptImageInput, Optional[PromptAudioInput]]],
+    inputs: Sequence[tuple[list[str], PromptImageInput, PromptAudioInput | None]],
     model: str,
     *,
     max_model_len: int,
@@ -88,7 +87,7 @@ def run_test(
     num_logprobs: int,
     mm_limit: int,
     tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
+    distributed_executor_backend: str | None = None,
 ):
     """Inference result should be the same between hf and vllm.
 
diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py
index bde07da9101ac..3cad2c43d5623 100644
--- a/tests/models/multimodal/generation/test_pixtral.py
+++ b/tests/models/multimodal/generation/test_pixtral.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
 from dataclasses import asdict
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import pytest
 from mistral_common.multimodal import download_image
@@ -117,7 +117,7 @@ FIXTURE_LOGPROBS_CHAT = {
     MISTRAL_SMALL_3_1_ID: FIXTURES_PATH / "mistral_small_3_chat.json",
 }
 
-OutputsLogprobs = list[tuple[list[int], str, Optional[SampleLogprobs]]]
+OutputsLogprobs = list[tuple[list[int], str, SampleLogprobs | None]]
 
 
 # For the test author to store golden output in JSON
diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py
index a8f0ba8701850..a4abf6e405f74 100644
--- a/tests/models/multimodal/generation/test_qwen2_vl.py
+++ b/tests/models/multimodal/generation/test_qwen2_vl.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Optional, TypedDict, Union
+from typing import Any, TypedDict
 
 import numpy.typing as npt
 import pytest
@@ -83,7 +83,7 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
 
 
 def batch_make_image_embeddings(
-    image_batches: list[Union[Image.Image, list[Image.Image]]],
+    image_batches: list[Image.Image | list[Image.Image]],
     processor,
     llm: VllmRunner,
 ) -> list[Qwen2VLPromptImageEmbeddingInput]:
@@ -272,7 +272,7 @@ def run_embedding_input_test(
     num_logprobs: int,
     mm_limit: int,
     tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
+    distributed_executor_backend: str | None = None,
 ):
     """Inference result should be the same between
     original image/video input and image/video embeddings input.
diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py
index 766f09b0d3207..eca2b61e37d53 100644
--- a/tests/models/multimodal/generation/test_whisper.py
+++ b/tests/models/multimodal/generation/test_whisper.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import pytest
 
@@ -92,7 +91,7 @@ def run_test(
     model: str,
     *,
     tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
+    distributed_executor_backend: str | None = None,
 ) -> None:
     prompt_list = PROMPTS * 10
     expected_list = EXPECTED[model] * 10
diff --git a/tests/models/multimodal/generation/vlm_utils/builders.py b/tests/models/multimodal/generation/vlm_utils/builders.py
index 096931cca09f7..6252f33bdfad7 100644
--- a/tests/models/multimodal/generation/vlm_utils/builders.py
+++ b/tests/models/multimodal/generation/vlm_utils/builders.py
@@ -2,9 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Helpers for building inputs that can be leveraged for different test types."""
 
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from pathlib import PosixPath
-from typing import Callable, Optional, Union
 
 import torch
 
@@ -47,9 +46,9 @@ def replace_test_placeholder(
 
 def get_model_prompts(
     base_prompts: Iterable[str],
-    img_idx_to_prompt: Optional[Callable[[int], str]],
-    video_idx_to_prompt: Optional[Callable[[int], str]],
-    audio_idx_to_prompt: Optional[Callable[[int], str]],
+    img_idx_to_prompt: Callable[[int], str] | None,
+    video_idx_to_prompt: Callable[[int], str] | None,
+    audio_idx_to_prompt: Callable[[int], str] | None,
     prompt_formatter: Callable[[str], str],
 ) -> list[str]:
     """Given a model-agnostic base prompt and test configuration for a model(s)
@@ -93,7 +92,7 @@ def build_single_image_inputs_from_test_info(
     test_info: VLMTestInfo,
     image_assets: ImageTestAssets,
     size_wrapper: ImageSizeWrapper,
-    tmp_path: Optional[PosixPath] = None,
+    tmp_path: PosixPath | None = None,
 ) -> list[PromptWithMultiModalInput]:
     if test_info.prompt_formatter is None:
         raise ValueError("Prompt formatter must be set to build single image inputs")
@@ -147,7 +146,7 @@ def build_multi_image_inputs_from_test_info(
     test_info: VLMTestInfo,
     image_assets: ImageTestAssets,
     size_wrapper: ImageSizeWrapper,
-    tmp_path: Optional[PosixPath] = None,
+    tmp_path: PosixPath | None = None,
 ) -> list[PromptWithMultiModalInput]:
     if test_info.prompt_formatter is None:
         raise ValueError("Prompt formatter must be set to build multi image inputs")
@@ -266,9 +265,7 @@ def build_video_inputs_from_test_info(
     ]
 
 
-def apply_image_size_scaling(
-    image, size: Union[float, tuple[int, int]], size_type: SizeType
-):
+def apply_image_size_scaling(image, size: float | tuple[int, int], size_type: SizeType):
     """Applies a size scaler to one image; this can be an image size factor,
     which scales the image while maintaining the aspect ratio"""
     # Special case for embeddings; if it's a tensor, it's only valid if we
diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py
index 5748ccc14c294..8d0e9b3eee9fd 100644
--- a/tests/models/multimodal/generation/vlm_utils/core.py
+++ b/tests/models/multimodal/generation/vlm_utils/core.py
@@ -2,7 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Core test implementation to be shared across modalities."""
 
-from typing import Any, Callable, Optional
+from collections.abc import Callable
+from typing import Any
 
 import torch
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
@@ -27,21 +28,21 @@ def run_test(
     enforce_eager: bool,
     max_model_len: int,
     max_num_seqs: int,
-    hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
-    vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
+    hf_output_post_proc: Callable[[RunnerOutput, str], Any] | None,
+    vllm_output_post_proc: Callable[[RunnerOutput, str], Any] | None,
     auto_cls: type[_BaseAutoModelClass],
     use_tokenizer_eos: bool,
     comparator: Callable[..., None],
-    get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]],
-    stop_str: Optional[list[str]],
+    get_stop_token_ids: Callable[[AnyTokenizer], list[int]] | None,
+    stop_str: list[str] | None,
     limit_mm_per_prompt: dict[str, int],
-    vllm_runner_kwargs: Optional[dict[str, Any]],
-    hf_model_kwargs: Optional[dict[str, Any]],
-    patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]],
+    vllm_runner_kwargs: dict[str, Any] | None,
+    hf_model_kwargs: dict[str, Any] | None,
+    patch_hf_runner: Callable[[HfRunner], HfRunner] | None,
     runner: RunnerOption = "auto",
-    distributed_executor_backend: Optional[str] = None,
+    distributed_executor_backend: str | None = None,
     tensor_parallel_size: int = 1,
-    vllm_embeddings: Optional[torch.Tensor] = None,
+    vllm_embeddings: torch.Tensor | None = None,
 ):
     """Modality agnostic test executor for comparing HF/vLLM outputs."""
     # In the case of embeddings, vLLM takes separate input tensors
diff --git a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
index 8f2f8bba39ca2..8c9c390911bdc 100644
--- a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
+++ b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Custom input builders for edge-cases in different models."""
 
-from typing import Callable
+from collections.abc import Callable
 
 from vllm.assets.image import ImageAsset
 from vllm.multimodal.image import rescale_image_size
diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py
index e51d895772c05..d9c1d53b61c28 100644
--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -7,7 +7,6 @@ typically specific to a small subset of models.
 
 import types
 from pathlib import PosixPath
-from typing import Optional, Union
 
 import numpy as np
 import numpy.typing as npt
@@ -58,7 +57,7 @@ def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutpu
 
 def qwen_vllm_to_hf_output(
     vllm_output: RunnerOutput, model: str
-) -> tuple[list[int], str, Optional[SampleLogprobs]]:
+) -> tuple[list[int], str, SampleLogprobs | None]:
     """Sanitize vllm output [qwen models] to be comparable with hf output."""
     output_ids, output_str, out_logprobs = vllm_output
 
@@ -69,7 +68,7 @@ def qwen_vllm_to_hf_output(
 
 def qwen2_vllm_to_hf_output(
     vllm_output: RunnerOutput, model: str
-) -> tuple[list[int], str, Optional[SampleLogprobs]]:
+) -> tuple[list[int], str, SampleLogprobs | None]:
     """Sanitize vllm output [qwen2 models] to be comparable with hf output."""
     output_ids, output_str, out_logprobs = vllm_output
 
@@ -80,7 +79,7 @@ def qwen2_vllm_to_hf_output(
 
 def kimiv_vl_vllm_to_hf_output(
     vllm_output: RunnerOutput, model: str
-) -> tuple[list[int], str, Optional[SampleLogprobs]]:
+) -> tuple[list[int], str, SampleLogprobs | None]:
     """Sanitize vllm output [kimi_vl models] to be comparable with hf output."""
     output_ids, output_str, out_logprobs = vllm_output
 
@@ -99,7 +98,7 @@ def llava_image_vllm_to_hf_output(
 
 def llava_video_vllm_to_hf_output(
     vllm_output: RunnerOutput, model: str
-) -> tuple[list[int], str, Optional[SampleLogprobs]]:
+) -> tuple[list[int], str, SampleLogprobs | None]:
     config = AutoConfig.from_pretrained(model)
     mm_token_id = config.video_token_index
     return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
@@ -263,7 +262,7 @@ def get_llava_embeddings(image_assets: ImageTestAssets):
 
 ####### Prompt path encoders for models that need models on disk
 def qwen_prompt_path_encoder(
-    tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset], ImageTestAssets]
+    tmp_path: PosixPath, prompt: str, assets: list[ImageAsset] | ImageTestAssets
 ) -> str:
     """Given a temporary dir path, export one or more image assets into the
     tempdir & replace its contents with the local path to the string so that
@@ -440,7 +439,7 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
             self.max_num = self.config.max_dynamic_patch
             self.image_size = self.vision_config.image_size
 
-        def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
+        def __call__(self, text: str, images: Image | list[Image], **kwargs):
             from vllm.model_executor.models.h2ovl import (
                 IMG_CONTEXT,
                 IMG_END,
@@ -499,7 +498,7 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
             self.max_num = self.config.max_dynamic_patch
             self.image_size = self.vision_config.image_size
 
-        def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
+        def __call__(self, text: str, images: Image | list[Image], **kwargs):
             from vllm.model_executor.models.skyworkr1v import (
                 IMG_CONTEXT,
                 IMG_END,
@@ -560,8 +559,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
         def __call__(
             self,
             text: str,
-            images: Union[Image, list[Image]] = None,
-            videos: Union[npt.NDArray, list[npt.NDArray]] = None,
+            images: Image | list[Image] = None,
+            videos: npt.NDArray | list[npt.NDArray] = None,
             **kwargs,
         ):
             from vllm.model_executor.models.internvl import (
@@ -650,7 +649,7 @@ def _internvl_generate(
     self,
     pixel_values: torch.FloatTensor,
     input_ids: torch.FloatTensor,
-    attention_mask: Optional[torch.LongTensor] = None,
+    attention_mask: torch.LongTensor | None = None,
     **generate_kwargs,
 ) -> torch.LongTensor:
     """Generate method for InternVL2 model without fixed use_cache."""
diff --git a/tests/models/multimodal/generation/vlm_utils/types.py b/tests/models/multimodal/generation/vlm_utils/types.py
index 6e82f7e3306ab..fe02f71884324 100644
--- a/tests/models/multimodal/generation/vlm_utils/types.py
+++ b/tests/models/multimodal/generation/vlm_utils/types.py
@@ -2,10 +2,10 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Types for writing multimodal model tests."""
 
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from enum import Enum
 from pathlib import PosixPath
-from typing import Any, Callable, NamedTuple, Optional, Union
+from typing import Any, NamedTuple
 
 import torch
 from pytest import MarkDecorator
@@ -52,16 +52,16 @@ VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
 
 IMAGE_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
 EMBEDDING_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0)]
-RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]]
+RunnerOutput = tuple[list[int], str, SampleLogprobs | None]
 
 
 class PromptWithMultiModalInput(NamedTuple):
     """Holds the multimodal input for a single test case."""
 
     prompts: list[str]
-    image_data: Optional[PromptImageInput] = None
-    video_data: Optional[PromptVideoInput] = None
-    audio_data: Optional[PromptAudioInput] = None
+    image_data: PromptImageInput | None = None
+    video_data: PromptVideoInput | None = None
+    audio_data: PromptAudioInput | None = None
 
 
 class VLMTestType(Enum):
@@ -87,17 +87,17 @@ class ImageSizeWrapper(NamedTuple):
     type: SizeType
     # A size factor is a wrapper of 0+ floats,
     # while a fixed size contains an iterable of integer pairs
-    data: Union[Iterable[float], Iterable[tuple[int, int]]]
+    data: Iterable[float] | Iterable[tuple[int, int]]
 
 
 class VLMTestInfo(NamedTuple):
     """Holds the configuration for 1+ tests for one model architecture."""
 
     models: list[str]
-    test_type: Union[VLMTestType, Iterable[VLMTestType]]
+    test_type: VLMTestType | Iterable[VLMTestType]
 
     # Should be None only if this is a CUSTOM_INPUTS test
-    prompt_formatter: Optional[Callable[[str], str]] = None
+    prompt_formatter: Callable[[str], str] | None = None
     img_idx_to_prompt: Callable[[int], str] = lambda idx: "<image>\n"
     video_idx_to_prompt: Callable[[int], str] = lambda idx: "<video>\n"
     audio_idx_to_prompt: Callable[[int], str] = lambda idx: "<audio>\n"
@@ -111,9 +111,9 @@ class VLMTestInfo(NamedTuple):
 
     # Function for converting ImageAssets to image embeddings;
     # We need to define this explicitly for embedding tests
-    convert_assets_to_embeddings: Optional[
-        Callable[[ImageTestAssets], list[torch.Tensor]]
-    ] = None
+    convert_assets_to_embeddings: (
+        Callable[[ImageTestAssets], list[torch.Tensor]] | None
+    ) = None
 
     # Exposed options for vLLM runner; we change these in a several tests,
     # but the defaults are derived from VllmRunner & the engine defaults
@@ -123,25 +123,25 @@ class VLMTestInfo(NamedTuple):
     max_num_seqs: int = 256
     runner: RunnerOption = "auto"
     tensor_parallel_size: int = 1
-    vllm_runner_kwargs: Optional[dict[str, Any]] = None
+    vllm_runner_kwargs: dict[str, Any] | None = None
 
     # Optional callable which gets a list of token IDs from the model tokenizer
-    get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]] = None
+    get_stop_token_ids: Callable[[AnyTokenizer], list[int]] | None = None
     # Optional list of strings to stop generation, useful when stop tokens are
     # not special tokens in the tokenizer
-    stop_str: Optional[list[str]] = None
+    stop_str: list[str] | None = None
 
     # Exposed options for HF runner
-    hf_model_kwargs: Optional[dict[str, Any]] = None
+    hf_model_kwargs: dict[str, Any] | None = None
     # Indicates we should explicitly pass the EOS from the tokenizer
     use_tokenizer_eos: bool = False
     auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM
-    patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]] = None
+    patch_hf_runner: Callable[[HfRunner], HfRunner] | None = None
 
     # Post processors that if defined, will run oun the outputs of the
     # vLLM and HF runner, respectively (useful for sanitization, etc).
-    vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]] = None
-    hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]] = None
+    vllm_output_post_proc: Callable[[RunnerOutput, str], Any] | None = None
+    hf_output_post_proc: Callable[[RunnerOutput, str], Any] | None = None
 
     # Consumes the output of the callables above and checks if they're equal
     comparator: Callable[..., None] = check_logprobs_close
@@ -152,7 +152,7 @@ class VLMTestInfo(NamedTuple):
     max_tokens: int = 128
     num_logprobs: int = 5
     dtype: str = "auto"
-    distributed_executor_backend: Optional[str] = None
+    distributed_executor_backend: str | None = None
     # Only expanded in video tests
     num_video_frames: int = 16
 
@@ -162,19 +162,19 @@ class VLMTestInfo(NamedTuple):
     # once per tests (much like concatenating and wrapping in one parametrize
     # call)
     image_size_factors: Iterable[Iterable[float]] = IMAGE_SIZE_FACTORS
-    image_sizes: Optional[Iterable[Iterable[tuple[int, int]]]] = None
+    image_sizes: Iterable[Iterable[tuple[int, int]]] | None = None
 
     # Hack for updating a prompt to take into a local path; currently only used
     # for Qwen-VL, which requires encoding the image path / url into the prompt
     # for HF runner
-    prompt_path_encoder: Optional[
-        Callable[[PosixPath, str, Union[list[ImageAsset], ImageTestAssets]], str]
-    ] = None  # noqa: E501
+    prompt_path_encoder: (
+        Callable[[PosixPath, str, list[ImageAsset] | ImageTestAssets], str] | None
+    ) = None  # noqa: E501
 
     # Allows configuring a test to run with custom inputs
-    custom_test_opts: Optional[list[CustomTestOptions]] = None
+    custom_test_opts: list[CustomTestOptions] | None = None
 
-    marks: Optional[list[MarkDecorator]] = None
+    marks: list[MarkDecorator] | None = None
 
     def get_non_parametrized_runner_kwargs(self):
         """Returns a dictionary of expandable kwargs for items that are used
@@ -207,10 +207,10 @@ class ExpandableVLMTestArgs(NamedTuple):
     max_tokens: int
     num_logprobs: int
     dtype: str
-    distributed_executor_backend: Optional[str]
+    distributed_executor_backend: str | None
     # Sizes are used for everything except for custom input tests
-    size_wrapper: Optional[ImageSizeWrapper] = None
+    size_wrapper: ImageSizeWrapper | None = None
     # Video only
-    num_video_frames: Optional[int] = None
+    num_video_frames: int | None = None
     # Custom inputs only
-    custom_test_opts: Optional[CustomTestOptions] = None
+    custom_test_opts: CustomTestOptions | None = None
diff --git a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py
index 7f30b1f299ba1..ac3eb6e61723d 100644
--- a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py
+++ b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable
+from collections.abc import Callable
 
 import pytest
 import torch
diff --git a/tests/models/multimodal/pooling/test_jinavl_reranker.py b/tests/models/multimodal/pooling/test_jinavl_reranker.py
index 853f56618290e..d7b33be7a0adb 100644
--- a/tests/models/multimodal/pooling/test_jinavl_reranker.py
+++ b/tests/models/multimodal/pooling/test_jinavl_reranker.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Union
 
 import pytest
 from transformers import AutoModel
@@ -32,7 +31,7 @@ def vllm_reranker(
     def create_image_param(url: str) -> ChatCompletionContentPartImageParam:
         return {"type": "image_url", "image_url": {"url": f"{url}"}}
 
-    query: Union[list[str], ScoreMultiModalParam]
+    query: list[str] | ScoreMultiModalParam
     if query_type == "text":
         query = query_strs
     elif query_type == "image":
@@ -40,7 +39,7 @@ def vllm_reranker(
             content=[create_image_param(url) for url in query_strs]
         )
 
-    documents: Union[list[str], ScoreMultiModalParam]
+    documents: list[str] | ScoreMultiModalParam
     if doc_type == "text":
         documents = document_strs
     elif doc_type == "image":
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index d1361f336a071..23f183e1d5bba 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from functools import partial
-from typing import Optional, Union
 
 import numpy as np
 import pytest
@@ -247,7 +246,7 @@ MM_DATA_PATCHES = {
 def _test_processing_correctness_one(
     model_config: ModelConfig,
     tokenizer: AnyTokenizer,
-    prompt: Union[str, list[int]],
+    prompt: str | list[int],
     mm_data: MultiModalDataDict,
     baseline_processor: BaseMultiModalProcessor,
     cached_processor: BaseMultiModalProcessor,
@@ -441,7 +440,7 @@ def _assert_inputs_equal(
     a: MultiModalInputs,
     b: MultiModalInputs,
     *,
-    ignore_mm_keys: Optional[set[str]] = None,
+    ignore_mm_keys: set[str] | None = None,
     msg: str = "",
 ):
     if ignore_mm_keys is None:
diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py
index bd21d4008fa7b..1701d9dd8f011 100644
--- a/tests/models/multimodal/processing/test_h2ovl.py
+++ b/tests/models/multimodal/processing/test_h2ovl.py
@@ -3,7 +3,6 @@
 """Tests for H2OVL's multimodal preprocessing kwargs."""
 
 from collections.abc import Mapping
-from typing import Optional
 
 import pytest
 from PIL import Image
@@ -149,7 +148,7 @@ def test_processor_override(
     size_factors: list[int],
     min_dynamic_patch: int,
     max_dynamic_patch: int,
-    dynamic_image_size: Optional[bool],
+    dynamic_image_size: bool | None,
     kwargs_on_init: bool,
 ):
     mm_processor_kwargs = {
diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py
index 6f6529cb9401a..b4994295d3a80 100644
--- a/tests/models/multimodal/processing/test_internvl.py
+++ b/tests/models/multimodal/processing/test_internvl.py
@@ -3,7 +3,6 @@
 """Tests for InternVL's multimodal preprocessing kwargs."""
 
 from collections.abc import Mapping
-from typing import Optional
 
 import pytest
 from PIL import Image
@@ -103,7 +102,7 @@ def test_processor_override(
     size_factors: list[int],
     min_dynamic_patch: int,
     max_dynamic_patch: int,
-    dynamic_image_size: Optional[bool],
+    dynamic_image_size: bool | None,
     kwargs_on_init: bool,
 ):
     mm_processor_kwargs = {
diff --git a/tests/models/multimodal/processing/test_nemotron_vl.py b/tests/models/multimodal/processing/test_nemotron_vl.py
index 6ff6f396fa338..5311ab1b78c69 100644
--- a/tests/models/multimodal/processing/test_nemotron_vl.py
+++ b/tests/models/multimodal/processing/test_nemotron_vl.py
@@ -3,7 +3,6 @@
 """Tests for Nemotron-Nano-VL's multimodal preprocessing kwargs."""
 
 from collections.abc import Mapping
-from typing import Optional
 
 import pytest
 from PIL import Image
@@ -105,7 +104,7 @@ def test_processor_override(
     size_factors: list[int],
     min_dynamic_patch: int,
     max_dynamic_patch: int,
-    dynamic_image_size: Optional[bool],
+    dynamic_image_size: bool | None,
     kwargs_on_init: bool,
 ):
     mm_processor_kwargs = {
diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py
index 6b6c53a50397b..9029f09de8c8b 100644
--- a/tests/models/multimodal/processing/test_tensor_schema.py
+++ b/tests/models/multimodal/processing/test_tensor_schema.py
@@ -4,7 +4,7 @@ import tempfile
 from collections.abc import Iterable
 from contextlib import contextmanager
 from functools import partial
-from typing import Any, Union
+from typing import Any, TypeAlias
 
 import numpy as np
 import pytest
@@ -55,15 +55,15 @@ REPO_ID_TO_SKIP = {
 }
 
 ImageInput = list[Image.Image]
-VideoInput = Union[
-    list[Image.Image], list[np.ndarray], list[tuple[np.ndarray, dict[str, Any]]]
-]
+VideoInput: TypeAlias = (
+    list[Image.Image] | list[np.ndarray] | list[tuple[np.ndarray, dict[str, Any]]]
+)
 AudioInput = list[tuple[np.ndarray, int]]
 
 
 def _resize_data(
-    _data: Union[Image.Image, np.ndarray], size_factor: float
-) -> Union[Image.Image, np.ndarray]:
+    _data: Image.Image | np.ndarray, size_factor: float
+) -> Image.Image | np.ndarray:
     assert size_factor <= 1, "Size factor must be less than 1"
     # Image input
     if isinstance(_data, Image.Image):
@@ -88,8 +88,8 @@ def _resize_data(
 
 
 def resize_mm_data(
-    data: Union[ImageInput, VideoInput, AudioInput], size_factors: tuple[float, ...]
-) -> Union[ImageInput, VideoInput, AudioInput]:
+    data: ImageInput | VideoInput | AudioInput, size_factors: tuple[float, ...]
+) -> ImageInput | VideoInput | AudioInput:
     size_factors = size_factors[: len(data)]
     if is_list_of(data, (Image.Image, np.ndarray, list)):
         return [_resize_data(d, s) for d, s in zip(data, size_factors)]
diff --git a/tests/models/quantization/test_awq.py b/tests/models/quantization/test_awq.py
index c4c10832ede3a..70464cf7fb416 100644
--- a/tests/models/quantization/test_awq.py
+++ b/tests/models/quantization/test_awq.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import pytest
 import torch
@@ -30,7 +29,7 @@ def run_awq_test(
     max_tokens: int,
     num_logprobs: int,
     tensor_parallel_size: int,
-    distributed_executor_backend: Optional[str] = None,
+    distributed_executor_backend: str | None = None,
 ):
     images = [asset.pil_image for asset in image_assets]
 
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 615b03998323a..ad90229adf8a7 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -3,7 +3,7 @@
 
 from collections.abc import Mapping, Set
 from dataclasses import dataclass, field
-from typing import Any, Literal, Optional
+from typing import Any, Literal
 
 import pytest
 import torch
@@ -21,29 +21,29 @@ class _HfExamplesInfo:
     extras: Mapping[str, str] = field(default_factory=dict)
     """Extra models to use for testing this architecture."""
 
-    tokenizer: Optional[str] = None
+    tokenizer: str | None = None
     """Set the tokenizer to load for this architecture."""
 
     tokenizer_mode: TokenizerMode = "auto"
     """Set the tokenizer type for this architecture."""
 
-    speculative_model: Optional[str] = None
+    speculative_model: str | None = None
     """
     The default model to use for testing this architecture, which is only used
     for speculative decoding.
     """
 
-    min_transformers_version: Optional[str] = None
+    min_transformers_version: str | None = None
     """
     The minimum version of HF Transformers that is required to run this model.
     """
 
-    max_transformers_version: Optional[str] = None
+    max_transformers_version: str | None = None
     """
     The maximum version of HF Transformers that this model runs on.
     """
 
-    transformers_version_reason: Optional[str] = None
+    transformers_version_reason: str | None = None
     """
     The reason for the minimum/maximum version requirement.
     """
@@ -82,19 +82,19 @@ class _HfExamplesInfo:
     hf_overrides: dict[str, Any] = field(default_factory=dict)
     """The ``hf_overrides`` required to load the model."""
 
-    max_model_len: Optional[int] = None
+    max_model_len: int | None = None
     """
     The maximum model length to use for this model. Some models default to a
     length that is too large to fit into memory in CI.
     """
 
-    revision: Optional[str] = None
+    revision: str | None = None
     """
     The specific revision (commit hash, tag, or branch) to use for the model.
     If not specified, the default revision will be used.
     """
 
-    max_num_seqs: Optional[int] = None
+    max_num_seqs: int | None = None
     """Maximum number of sequences to be processed in a single iteration."""
 
     use_original_num_layers: bool = False
@@ -109,7 +109,7 @@ class _HfExamplesInfo:
         on_fail: Literal["error", "skip", "return"],
         check_min_version: bool = True,
         check_max_version: bool = True,
-    ) -> Optional[str]:
+    ) -> str | None:
         """
         If the installed transformers version does not meet the requirements,
         perform the given action.
diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py
index b434c0955be7e..f9e252a23ba7a 100644
--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Test the functionality of the Transformers backend."""
 
-from typing import Any, Optional, Union
+from typing import Any
 
 import pytest
 
@@ -21,12 +21,12 @@ def get_model(arch: str) -> str:
 
 
 def check_implementation(
-    runner_ref: type[Union[HfRunner, VllmRunner]],
+    runner_ref: type[HfRunner | VllmRunner],
     runner_test: type[VllmRunner],
     example_prompts: list[str],
     model: str,
-    kwargs_ref: Optional[dict[str, Any]] = None,
-    kwargs_test: Optional[dict[str, Any]] = None,
+    kwargs_ref: dict[str, Any] | None = None,
+    kwargs_test: dict[str, Any] | None = None,
     **kwargs,
 ):
     if kwargs_ref is None:
diff --git a/tests/models/utils.py b/tests/models/utils.py
index 3d6e6cb89d62a..82da4aa649215 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -4,7 +4,7 @@
 import warnings
 from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 import torch.nn.functional as F
@@ -57,7 +57,7 @@ def check_outputs_equal(
 #
 # Assumes prompt logprobs were not requested.
 TokensTextLogprobs = tuple[
-    list[int], str, Optional[Union[list[dict[int, float]], SampleLogprobs]]
+    list[int], str, list[dict[int, float]] | SampleLogprobs | None
 ]
 
 # Allow for tokens to be represented as str's rather than IDs;
@@ -68,7 +68,7 @@ TokensTextLogprobs = tuple[
 #
 # Assumes prompt logprobs were not requested.
 TextTextLogprobs = tuple[
-    list[str], str, Optional[Union[list[dict[str, float]], list[dict[str, Logprob]]]]
+    list[str], str, list[dict[str, float]] | list[dict[str, Logprob]] | None
 ]
 
 # Representation of generated sequence as a tuple of
@@ -81,18 +81,18 @@ TextTextLogprobs = tuple[
 TokensTextLogprobsPromptLogprobs = tuple[
     list[int],
     str,
-    Optional[Union[list[dict[int, float]], SampleLogprobs]],
-    Optional[Union[list[Optional[dict[int, float]]], PromptLogprobs]],
+    list[dict[int, float]] | SampleLogprobs | None,
+    list[dict[int, float] | None] | PromptLogprobs | None,
 ]
 
 
 def check_logprobs_close(
     *,
     outputs_0_lst: Sequence[
-        Union[TokensTextLogprobs, TokensTextLogprobsPromptLogprobs, TextTextLogprobs]
+        TokensTextLogprobs | TokensTextLogprobsPromptLogprobs | TextTextLogprobs
     ],
     outputs_1_lst: Sequence[
-        Union[TokensTextLogprobs, TokensTextLogprobsPromptLogprobs, TextTextLogprobs]
+        TokensTextLogprobs | TokensTextLogprobsPromptLogprobs | TextTextLogprobs
     ],
     name_0: str,
     name_1: str,
@@ -273,9 +273,9 @@ def build_model_context(
     model_id: str,
     runner: RunnerOption = "auto",
     dtype: ModelDType = "auto",
-    model_config_kwargs: Optional[dict[str, Any]] = None,
-    mm_processor_kwargs: Optional[dict[str, Any]] = None,
-    limit_mm_per_prompt: Optional[dict[str, int]] = None,
+    model_config_kwargs: dict[str, Any] | None = None,
+    mm_processor_kwargs: dict[str, Any] | None = None,
+    limit_mm_per_prompt: dict[str, int] | None = None,
     mm_processor_cache_gb: int = 0,
 ):
     """Creates an InputProcessingContext for a given model.
@@ -369,18 +369,18 @@ class ModelInfo:
     name: str
     architecture: str = ""
     dtype: str = "auto"
-    max_model_len: Optional[int] = None
+    max_model_len: int | None = None
     hf_dtype: str = "float32"
-    hf_overrides: Optional[dict[str, Any]] = None
+    hf_overrides: dict[str, Any] | None = None
     default_pooling_type: str = ""
     enable_test: bool = True
 
 
 @dataclass
 class EmbedModelInfo(ModelInfo):
-    mteb_score: Optional[float] = None
+    mteb_score: float | None = None
     is_matryoshka: bool = False
-    matryoshka_dimensions: Optional[list[int]] = None
+    matryoshka_dimensions: list[int] | None = None
 
 
 @dataclass
@@ -395,7 +395,7 @@ class LASTPoolingEmbedModelInfo(EmbedModelInfo):
 
 @dataclass
 class RerankModelInfo(ModelInfo):
-    mteb_score: Optional[float] = None
+    mteb_score: float | None = None
 
 
 @dataclass
@@ -411,14 +411,14 @@ class LASTPoolingRerankModelInfo(RerankModelInfo):
 @dataclass
 class GenerateModelInfo(ModelInfo):
     hf_dtype: str = "auto"
-    hf_ppl: Optional[float] = None
+    hf_ppl: float | None = None
 
 
 def dummy_hf_overrides(
     hf_config: PretrainedConfig,
     *,
     model_arch: str = "",
-    exist_overrides: Optional[dict[str, Any]] = None,
+    exist_overrides: dict[str, Any] | None = None,
     use_original_num_layers: bool = False,
 ) -> PretrainedConfig:
     """
@@ -507,8 +507,8 @@ def dummy_hf_overrides(
 
 def check_transformers_version(
     model: str,
-    min_transformers_version: Optional[str] = None,
-    max_transformers_version: Optional[str] = None,
+    min_transformers_version: str | None = None,
+    max_transformers_version: str | None = None,
 ):
     from .registry import _HfExamplesInfo
 
diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py
index fe983990b90c8..531674c30f55f 100644
--- a/tests/multimodal/test_cache.py
+++ b/tests/multimodal/test_cache.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import numpy as np
 import pytest
@@ -32,7 +31,7 @@ def _dummy_elem(
     key: str,
     size: int,
     *,
-    rng: Optional[np.random.RandomState] = None,
+    rng: np.random.RandomState | None = None,
 ):
     if rng is None:
         data = torch.empty((size,), dtype=torch.int8)
@@ -51,7 +50,7 @@ def _dummy_item(
     modality: str,
     size_by_key: dict[str, int],
     *,
-    rng: Optional[np.random.RandomState] = None,
+    rng: np.random.RandomState | None = None,
 ):
     return MultiModalKwargsItem.from_elems(
         [_dummy_elem(modality, key, size, rng=rng) for key, size in size_by_key.items()]
@@ -61,7 +60,7 @@ def _dummy_item(
 def _dummy_items(
     size_by_key_modality: dict[str, dict[str, int]],
     *,
-    rng: Optional[np.random.RandomState] = None,
+    rng: np.random.RandomState | None = None,
 ):
     return MultiModalKwargsItems.from_seq(
         [
diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py
index a542b068a42b6..2f04bc6695c81 100644
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import nullcontext
-from typing import Optional, cast
+from typing import cast
 
 import numpy as np
 import pytest
@@ -1003,7 +1003,7 @@ class DummyProcessor:
         self,
         a: int = 0,
         c: int = 0,
-        return_tensors: Optional[str] = None,
+        return_tensors: str | None = None,
     ) -> dict[str, int]:
         return dict(a=a, c=c)
 
diff --git a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
index a2a8d0ec9aba4..772824cdde8fe 100644
--- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
+++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
@@ -1,15 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import base64
 import datetime
 import os
 import tempfile
 import urllib.request
 from collections.abc import Sequence
-from typing import Any, Union
+from typing import Any
 
 import albumentations
 import numpy as np
@@ -160,11 +158,11 @@ def read_geotiff(
 
 
 def load_image(
-    data: Union[list[str]],
+    data: list[str],
     path_type: str,
     mean: list[float] | None = None,
     std: list[float] | None = None,
-    indices: Union[list[int], None] | None = None,
+    indices: list[int] | None = None,
 ):
     """Build an input example by loading images in *file_paths*.
 
@@ -280,7 +278,7 @@ class PrithviMultimodalDataProcessor(IOProcessor):
         prompt: IOProcessorInput,
         request_id: str | None = None,
         **kwargs,
-    ) -> Union[PromptType, Sequence[PromptType]]:
+    ) -> PromptType | Sequence[PromptType]:
         image_data = dict(prompt)
 
         if request_id:
diff --git a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py
index 21a5c3754c36f..d1d7873211f25 100644
--- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py
+++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Literal, Optional, TypedDict, Union
+from typing import Any, Literal, TypedDict
 
 import albumentations
 from pydantic import BaseModel
@@ -38,7 +38,7 @@ class ImagePrompt(BaseModel):
     """
 
 
-MultiModalPromptType = Union[ImagePrompt]
+MultiModalPromptType = ImagePrompt
 
 
 class ImageRequestOutput(BaseModel):
@@ -54,4 +54,4 @@ class ImageRequestOutput(BaseModel):
     type: Literal["path", "b64_json"]
     format: str
     data: str
-    request_id: Optional[str] = None
+    request_id: str | None = None
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
index a22a10eab47dc..d1dae587d38eb 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
-from typing import Optional, Union
 
 import torch
 import torch.nn as nn
@@ -44,9 +43,9 @@ class MyGemma2Embedding(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids,
             positions,
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
index 9e6f5c3a77e3c..79af3ad842f5b 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -20,7 +19,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
     dummy_inputs=LlavaDummyInputsBuilder,
 )
 class MyLlava(LlavaForConditionalGeneration):
-    def compute_logits(self, hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
         # this dummy model always predicts the first token
         logits = super().compute_logits(hidden_states)
         if logits is not None:
diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
index c02299f5d44f2..f1e6e7b10f8b6 100644
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -9,7 +8,7 @@ from vllm.model_executor.models.opt import OPTForCausalLM
 
 
 class MyOPTForCausalLM(OPTForCausalLM):
-    def compute_logits(self, hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
         # this dummy model always predicts the first token
         logits = super().compute_logits(hidden_states)
         if logits is not None:
diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
index c4fe6ed197f6b..280b68514e193 100644
--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py
@@ -1,10 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
-
-def dummy_platform_plugin() -> Optional[str]:
+def dummy_platform_plugin() -> str | None:
     return "vllm_add_dummy_platform.dummy_platform.DummyPlatform"
 
 
diff --git a/tests/quantization/test_blackwell_moe.py b/tests/quantization/test_blackwell_moe.py
index 3ad68172d771e..fc7f73e77c313 100644
--- a/tests/quantization/test_blackwell_moe.py
+++ b/tests/quantization/test_blackwell_moe.py
@@ -3,7 +3,6 @@
 
 import json
 import os
-from typing import Optional
 
 import pytest
 
@@ -30,7 +29,7 @@ def set_test_environment():
 dummy_hf_overrides = {"num_layers": 4, "num_hidden_layers": 4}
 
 
-def can_initialize(model: str, extra_args: Optional[list[str]] = None):
+def can_initialize(model: str, extra_args: list[str] | None = None):
     # Server arguments
     extra_args = extra_args if extra_args is not None else []
     server_args = [
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 824d927724e02..ef7164c8813da 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -5,8 +5,6 @@
 Run `pytest tests/quantization/test_compressed_tensors.py`.
 """
 
-from typing import Optional
-
 import pytest
 import torch
 from compressed_tensors.quantization import QuantizationType
@@ -104,7 +102,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
             down_proj = layer.mlp.down_proj
 
             # assert zp for symmetric and asymmetric cases
-            def zp_valid(zp: Optional[torch.Tensor]):
+            def zp_valid(zp: torch.Tensor | None):
                 if is_symmetric:
                     return zp is None
 
diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py
index 1e65d9a995ce2..8875fdd1170aa 100644
--- a/tests/quantization/test_quark.py
+++ b/tests/quantization/test_quark.py
@@ -11,7 +11,6 @@ import importlib.metadata
 import os
 from dataclasses import dataclass
 from importlib.util import find_spec
-from typing import Optional
 
 import huggingface_hub
 import lm_eval
@@ -156,8 +155,8 @@ class AccuracyTestConfig:
     def get_model_args(
         self,
         tp_size: int,
-        model_max_len: Optional[int] = None,
-        kwargs: Optional[dict] = None,
+        model_max_len: int | None = None,
+        kwargs: dict | None = None,
     ) -> dict:
         if kwargs is None:
             kwargs = {}
diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py
index b70c2ee7fe2e6..aeef4c2fd8a70 100644
--- a/tests/quantization/test_register_quantization_config.py
+++ b/tests/quantization/test_register_quantization_config.py
@@ -7,7 +7,7 @@ See https://github.com/vllm-project/vllm/issues/11926 for more details.
 Run `pytest tests/quantization/test_register_quantization_config.py`.
 """
 
-from typing import Any, Optional
+from typing import Any
 
 import pytest
 import torch
@@ -37,10 +37,10 @@ class FakeQuantLinearMethod(UnquantizedLinearMethod):
 
     def apply(
         self,
-        layer: "torch.nn.Module",
-        x: "torch.Tensor",
-        bias: Optional["torch.Tensor"] = None,
-    ) -> "torch.Tensor":
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
         """Perform fake quantization before the linear layer."""
 
         # Calculate the scales dynamically
@@ -72,7 +72,7 @@ class CustomQuantConfig(QuantizationConfig):
         """Name of the quantization method."""
         return "custom_quant"
 
-    def get_supported_act_dtypes(self) -> list["torch.dtype"]:
+    def get_supported_act_dtypes(self) -> list[torch.dtype]:
         """List of supported activation dtypes."""
         return [torch.float16, torch.bfloat16]
 
@@ -92,8 +92,8 @@ class CustomQuantConfig(QuantizationConfig):
         return CustomQuantConfig(num_bits=config.get("num_bits", 8))
 
     def get_quant_method(
-        self, layer: "torch.nn.Module", prefix: str
-    ) -> Optional["FakeQuantLinearMethod"]:
+        self, layer: torch.nn.Module, prefix: str
+    ) -> FakeQuantLinearMethod | None:
         """Get the quantize method to use for the quantized layer."""
         if isinstance(layer, LinearBase):
             return FakeQuantLinearMethod(num_bits=self.num_bits)
diff --git a/tests/reasoning/utils.py b/tests/reasoning/utils.py
index 788136e996815..ccd4ff8dd263a 100644
--- a/tests/reasoning/utils.py
+++ b/tests/reasoning/utils.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
 
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
 from vllm.reasoning import ReasoningParser
@@ -34,9 +33,9 @@ class StreamingReasoningReconstructor:
 def run_reasoning_extraction(
     reasoning_parser: ReasoningParser,
     model_output: list[str],
-    request: Union[ChatCompletionRequest, None] = None,
+    request: ChatCompletionRequest | None = None,
     streaming: bool = False,
-) -> tuple[Optional[str], Optional[str]]:
+) -> tuple[str | None, str | None]:
     if streaming:
         reconstructor = run_reasoning_extraction_streaming(
             reasoning_parser,
@@ -57,9 +56,9 @@ def run_reasoning_extraction(
 def run_reasoning_extraction_mistral(
     reasoning_parser: ReasoningParser,
     model_output: list[int],
-    request: Union[ChatCompletionRequest, None] = None,
+    request: ChatCompletionRequest | None = None,
     streaming: bool = False,
-) -> tuple[Optional[str], Optional[str]]:
+) -> tuple[str | None, str | None]:
     assert isinstance(reasoning_parser.model_tokenizer, MistralTokenizer), type(
         reasoning_parser.model_tokenizer
     )
@@ -86,8 +85,8 @@ def run_reasoning_extraction_mistral(
 def run_reasoning_extraction_nonstreaming(
     reasoning_parser: ReasoningParser,
     model_output: list[str],
-    request: Union[ChatCompletionRequest, None] = None,
-) -> tuple[Optional[str], Optional[str]]:
+    request: ChatCompletionRequest | None = None,
+) -> tuple[str | None, str | None]:
     request = request or ChatCompletionRequest(messages=[], model="test-model")
     return reasoning_parser.extract_reasoning_content(
         model_output="".join(model_output), request=request
@@ -97,7 +96,7 @@ def run_reasoning_extraction_nonstreaming(
 def run_reasoning_extraction_streaming(
     reasoning_parser: ReasoningParser,
     model_deltas: list[str],
-    request: Union[ChatCompletionRequest, None] = None,
+    request: ChatCompletionRequest | None = None,
 ) -> StreamingReasoningReconstructor:
     request = request or ChatCompletionRequest(messages=[], model="test-model")
     reconstructor = StreamingReasoningReconstructor()
@@ -129,7 +128,7 @@ def run_reasoning_extraction_streaming(
 def run_reasoning_extraction_streaming_mistral(
     reasoning_parser: ReasoningParser,
     model_deltas: list[int],
-    request: Union[ChatCompletionRequest, None] = None,
+    request: ChatCompletionRequest | None = None,
 ) -> StreamingReasoningReconstructor:
     assert isinstance(reasoning_parser.model_tokenizer, MistralTokenizer), type(
         reasoning_parser.model_tokenizer
diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py
index fa0ca48f9bd9c..1b76b909629c9 100644
--- a/tests/samplers/test_no_bad_words.py
+++ b/tests/samplers/test_no_bad_words.py
@@ -6,8 +6,6 @@ Run `pytest tests/samplers/test_no_bad_words.py`.
 
 """
 
-from typing import Optional
-
 from transformers import AutoTokenizer
 
 from vllm import LLM, SamplingParams
@@ -18,7 +16,7 @@ def _generate(
     prompt: str,
     num_prompt_tokens: int,
     temperature: float = 0,
-    bad_words: Optional[list[str]] = None,
+    bad_words: list[str] | None = None,
 ) -> list[int]:
     sampling_params = SamplingParams(
         temperature=temperature,
@@ -60,7 +58,7 @@ class TestOneTokenBadWord:
             output_token_ids = self._generate(llm, bad_words=[self.TARGET_TOKEN])
             assert self.target_token_id not in output_token_ids
 
-    def _generate(self, llm: LLM, bad_words: Optional[list[str]] = None) -> list[int]:
+    def _generate(self, llm: LLM, bad_words: list[str] | None = None) -> list[int]:
         return _generate(
             llm=llm,
             prompt=self.PROMPT,
@@ -155,7 +153,7 @@ class TestTwoTokenBadWord:
                 self.neighbour_token_id2 in output_token_ids
             )
 
-    def _generate(self, llm: LLM, bad_words: Optional[list[str]] = None) -> list[int]:
+    def _generate(self, llm: LLM, bad_words: list[str] | None = None) -> list[int]:
         return _generate(
             llm=llm,
             prompt=self.PROMPT,
diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index 14dcab7707d4e..f4b43a21daaa8 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Generator
-from typing import Any, Optional
+from typing import Any
 
 import pytest
 from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
@@ -52,7 +52,7 @@ def _run_incremental_decode(
     skip_special_tokens: bool,
     starting_index: int,
     spaces_between_special_tokens: bool = True,
-    fast: Optional[bool] = None,
+    fast: bool | None = None,
 ):
     prompt_token_ids = all_input_ids[:starting_index]
 
diff --git a/tests/tokenization/test_tokenizer_registry.py b/tests/tokenization/test_tokenizer_registry.py
index de67c3e798c4e..d89737888aa2b 100644
--- a/tests/tokenization/test_tokenizer_registry.py
+++ b/tests/tokenization/test_tokenizer_registry.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any
 
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.transformers_utils.tokenizer_base import TokenizerBase, TokenizerRegistry
@@ -61,11 +61,11 @@ class TestTokenizer(TokenizerBase):
 
     def __call__(
         self,
-        text: Union[str, list[str], list[int]],
-        text_pair: Optional[str] = None,
+        text: str | list[str] | list[int],
+        text_pair: str | None = None,
         add_special_tokens: bool = False,
         truncation: bool = False,
-        max_length: Optional[int] = None,
+        max_length: int | None = None,
     ):
         raise NotImplementedError()
 
@@ -79,17 +79,17 @@ class TestTokenizer(TokenizerBase):
         self,
         text: str,
         truncation: bool = False,
-        max_length: Optional[int] = None,
+        max_length: int | None = None,
     ) -> list[int]:
         raise NotImplementedError()
 
-    def encode(self, text: str, add_special_tokens: Optional[bool] = None) -> list[int]:
+    def encode(self, text: str, add_special_tokens: bool | None = None) -> list[int]:
         raise NotImplementedError()
 
     def apply_chat_template(
         self,
         messages: list["ChatCompletionMessageParam"],
-        tools: Optional[list[dict[str, Any]]] = None,
+        tools: list[dict[str, Any]] | None = None,
         **kwargs,
     ) -> list[int]:
         raise NotImplementedError()
@@ -97,9 +97,7 @@ class TestTokenizer(TokenizerBase):
     def convert_tokens_to_string(self, tokens: list[str]) -> str:
         raise NotImplementedError()
 
-    def decode(
-        self, ids: Union[list[int], int], skip_special_tokens: bool = True
-    ) -> str:
+    def decode(self, ids: list[int] | int, skip_special_tokens: bool = True) -> str:
         raise NotImplementedError()
 
     def convert_ids_to_tokens(
diff --git a/tests/tool_use/mistral/utils.py b/tests/tool_use/mistral/utils.py
index 13a234f8e26be..4d772ba63793d 100644
--- a/tests/tool_use/mistral/utils.py
+++ b/tests/tool_use/mistral/utils.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 from typing_extensions import TypedDict
 
@@ -9,9 +8,9 @@ from typing_extensions import TypedDict
 class ServerConfig(TypedDict, total=False):
     model: str
     arguments: list[str]
-    system_prompt: Optional[str]
-    supports_parallel: Optional[bool]
-    supports_rocm: Optional[bool]
+    system_prompt: str | None
+    supports_parallel: bool | None
+    supports_rocm: bool | None
 
 
 ARGS: list[str] = ["--max-model-len", "1024"]
diff --git a/tests/tool_use/test_jamba_tool_parser.py b/tests/tool_use/test_jamba_tool_parser.py
index 44d42bbd72b04..6dcdd5ba2ce76 100644
--- a/tests/tool_use/test_jamba_tool_parser.py
+++ b/tests/tool_use/test_jamba_tool_parser.py
@@ -3,7 +3,6 @@
 
 import json
 from collections.abc import Generator
-from typing import Optional
 
 import partial_json_parser
 import pytest
@@ -248,7 +247,7 @@ def test_extract_tool_calls_streaming(
     function_names: list[str] = []
     function_args_strs: list[str] = []
     tool_call_idx: int = -1
-    tool_call_ids: list[Optional[str]] = []
+    tool_call_ids: list[str | None] = []
 
     for delta_message in stream_delta_message_generator(
         jamba_tool_parser, jamba_tokenizer, model_output
diff --git a/tests/tool_use/test_parallel_tool_calls.py b/tests/tool_use/test_parallel_tool_calls.py
index 159966365ec45..9af94a6a64a25 100644
--- a/tests/tool_use/test_parallel_tool_calls.py
+++ b/tests/tool_use/test_parallel_tool_calls.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
-from typing import Optional
 
 import openai
 import pytest
@@ -80,7 +79,7 @@ async def test_parallel_tool_calls(
         stream=True,
     )
 
-    role_name: Optional[str] = None
+    role_name: str | None = None
     finish_reason_count: int = 0
 
     tool_call_names: list[str] = []
diff --git a/tests/tool_use/test_qwen3coder_tool_parser.py b/tests/tool_use/test_qwen3coder_tool_parser.py
index 20fa3b08c7b98..b4f0989b1b19c 100644
--- a/tests/tool_use/test_qwen3coder_tool_parser.py
+++ b/tests/tool_use/test_qwen3coder_tool_parser.py
@@ -3,7 +3,6 @@
 
 import json
 from collections.abc import Generator
-from typing import Optional
 
 import pytest
 
@@ -107,7 +106,7 @@ def stream_delta_message_generator(
     qwen3_tool_parser,
     qwen3_tokenizer: AnyTokenizer,
     model_output: str,
-    request: Optional[ChatCompletionRequest] = None,
+    request: ChatCompletionRequest | None = None,
 ) -> Generator[DeltaMessage, None, None]:
     all_token_ids = qwen3_tokenizer.encode(model_output, add_special_tokens=False)
 
diff --git a/tests/tool_use/test_seed_oss_tool_parser.py b/tests/tool_use/test_seed_oss_tool_parser.py
index eddb5a9b9f5ec..1133b949f2270 100644
--- a/tests/tool_use/test_seed_oss_tool_parser.py
+++ b/tests/tool_use/test_seed_oss_tool_parser.py
@@ -4,7 +4,6 @@
 
 import json
 from collections.abc import Generator
-from typing import Optional
 
 import pytest
 
@@ -259,7 +258,7 @@ def stream_delta_message_generator(
     seed_oss_tool_parser: SeedOssToolParser,
     seed_oss_tokenizer: AnyTokenizer,
     model_output: str,
-    request: Optional[ChatCompletionRequest] = None,
+    request: ChatCompletionRequest | None = None,
 ) -> Generator[DeltaMessage, None, None]:
     all_token_ids = seed_oss_tokenizer.encode(model_output, add_special_tokens=False)
 
diff --git a/tests/tool_use/test_tool_calls.py b/tests/tool_use/test_tool_calls.py
index 64186aaac6a74..6614b6415a04f 100644
--- a/tests/tool_use/test_tool_calls.py
+++ b/tests/tool_use/test_tool_calls.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import json
-from typing import Optional
 
 import openai
 import pytest
@@ -58,10 +57,10 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI):
 
     assert stop_reason == "tool_calls"
 
-    function_name: Optional[str] = None
+    function_name: str | None = None
     function_args_str: str = ""
-    tool_call_id: Optional[str] = None
-    role_name: Optional[str] = None
+    tool_call_id: str | None = None
+    role_name: str | None = None
     finish_reason_count: int = 0
 
     # make the same request, streaming
diff --git a/tests/tool_use/test_xlam_tool_parser.py b/tests/tool_use/test_xlam_tool_parser.py
index bdac878db4e76..8c27b2911f8f9 100644
--- a/tests/tool_use/test_xlam_tool_parser.py
+++ b/tests/tool_use/test_xlam_tool_parser.py
@@ -3,7 +3,6 @@
 
 import json
 from collections.abc import Generator
-from typing import Optional
 
 import pytest
 
@@ -52,7 +51,7 @@ def stream_delta_message_generator(
     xlam_tool_parser: xLAMToolParser,
     xlam_tokenizer: AnyTokenizer,
     model_output: str,
-    request: Optional[ChatCompletionRequest] = None,
+    request: ChatCompletionRequest | None = None,
 ) -> Generator[DeltaMessage, None, None]:
     all_token_ids = xlam_tokenizer.encode(model_output, add_special_tokens=False)
 
diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py
index 835d07608e408..38def6f874d7d 100644
--- a/tests/tool_use/utils.py
+++ b/tests/tool_use/utils.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from copy import deepcopy
-from typing import Any, Optional
+from typing import Any
 
 from openai.types.chat import ChatCompletionMessageParam, ChatCompletionToolParam
 from typing_extensions import TypedDict
@@ -13,10 +13,10 @@ from tests.utils import VLLM_PATH
 class ServerConfig(TypedDict, total=False):
     model: str
     arguments: list[str]
-    system_prompt: Optional[str]
-    supports_parallel: Optional[bool]
-    supports_rocm: Optional[bool]
-    extended: Optional[bool]  # tests do not run in CI automatically
+    system_prompt: str | None
+    supports_parallel: bool | None
+    supports_rocm: bool | None
+    extended: bool | None  # tests do not run in CI automatically
 
 
 def patch_system_prompt(
diff --git a/tests/transformers_utils/test_config_parser_registry.py b/tests/transformers_utils/test_config_parser_registry.py
index 9372cb9d46d30..0931bd734f8f0 100644
--- a/tests/transformers_utils/test_config_parser_registry.py
+++ b/tests/transformers_utils/test_config_parser_registry.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from pathlib import Path
-from typing import Optional, Union
 
 import pytest
 from transformers import PretrainedConfig
@@ -15,10 +14,10 @@ from vllm.transformers_utils.config_parser_base import ConfigParserBase
 class CustomConfigParser(ConfigParserBase):
     def parse(
         self,
-        model: Union[str, Path],
+        model: str | Path,
         trust_remote_code: bool,
-        revision: Optional[str] = None,
-        code_revision: Optional[str] = None,
+        revision: str | None = None,
+        code_revision: str | None = None,
         **kwargs,
     ) -> tuple[dict, PretrainedConfig]:
         raise NotImplementedError
diff --git a/tests/utils.py b/tests/utils.py
index b853542c241fc..8fee507084382 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -15,10 +15,11 @@ import sys
 import tempfile
 import time
 import warnings
+from collections.abc import Callable
 from contextlib import ExitStack, contextmanager, suppress
 from multiprocessing import Process
 from pathlib import Path
-from typing import Any, Callable, Literal, Optional, Union
+from typing import Any, Literal
 from unittest.mock import patch
 
 import cloudpickle
@@ -94,7 +95,7 @@ class RemoteOpenAIServer:
     DUMMY_API_KEY = "token-abc123"  # vLLM's OpenAI server does not need API key
 
     def _start_server(
-        self, model: str, vllm_serve_args: list[str], env_dict: Optional[dict[str, str]]
+        self, model: str, vllm_serve_args: list[str], env_dict: dict[str, str] | None
     ) -> None:
         """Subclasses override this method to customize server process launch"""
         env = os.environ.copy()
@@ -117,11 +118,11 @@ class RemoteOpenAIServer:
         model: str,
         vllm_serve_args: list[str],
         *,
-        env_dict: Optional[dict[str, str]] = None,
-        seed: Optional[int] = 0,
+        env_dict: dict[str, str] | None = None,
+        seed: int | None = 0,
         auto_port: bool = True,
-        max_wait_seconds: Optional[float] = None,
-        override_hf_configs: Optional[dict[str, Any]] = None,
+        max_wait_seconds: float | None = None,
+        override_hf_configs: dict[str, Any] | None = None,
     ) -> None:
         if auto_port:
             if "-p" in vllm_serve_args or "--port" in vllm_serve_args:
@@ -186,7 +187,7 @@ class RemoteOpenAIServer:
             # force kill if needed
             self.proc.kill()
 
-    def _poll(self) -> Optional[int]:
+    def _poll(self) -> int | None:
         """Subclasses override this method to customize process polling"""
         return self.proc.poll()
 
@@ -251,7 +252,7 @@ class RemoteOpenAIServerCustom(RemoteOpenAIServer):
     """Launch test server with custom child process"""
 
     def _start_server(
-        self, model: str, vllm_serve_args: list[str], env_dict: Optional[dict[str, str]]
+        self, model: str, vllm_serve_args: list[str], env_dict: dict[str, str] | None
     ) -> None:
         self.proc: Process = Process(
             target=self.child_process_fxn, args=(env_dict, model, vllm_serve_args)
@@ -262,12 +263,12 @@ class RemoteOpenAIServerCustom(RemoteOpenAIServer):
         self,
         model: str,
         vllm_serve_args: list[str],
-        child_process_fxn: Callable[[Optional[dict[str, str]], str, list[str]], None],
+        child_process_fxn: Callable[[dict[str, str] | None, str, list[str]], None],
         *,
-        env_dict: Optional[dict[str, str]] = None,
-        seed: Optional[int] = 0,
+        env_dict: dict[str, str] | None = None,
+        seed: int | None = 0,
         auto_port: bool = True,
-        max_wait_seconds: Optional[float] = None,
+        max_wait_seconds: float | None = None,
     ) -> None:
         """Store custom child process function then invoke superclass
         constructor which will indirectly launch it."""
@@ -281,7 +282,7 @@ class RemoteOpenAIServerCustom(RemoteOpenAIServer):
             max_wait_seconds=max_wait_seconds,
         )
 
-    def _poll(self) -> Optional[int]:
+    def _poll(self) -> int | None:
         return self.proc.exitcode
 
     def __exit__(self, exc_type, exc_value, traceback):
@@ -547,11 +548,11 @@ def compare_two_settings(
     model: str,
     arg1: list[str],
     arg2: list[str],
-    env1: Optional[dict[str, str]] = None,
-    env2: Optional[dict[str, str]] = None,
+    env1: dict[str, str] | None = None,
+    env2: dict[str, str] | None = None,
     *,
     method: str = "generate",
-    max_wait_seconds: Optional[float] = None,
+    max_wait_seconds: float | None = None,
 ) -> None:
     """
     Launch API server with two different sets of arguments/environments
@@ -577,10 +578,10 @@ def compare_two_settings(
 def compare_all_settings(
     model: str,
     all_args: list[list[str]],
-    all_envs: list[Optional[dict[str, str]]],
+    all_envs: list[dict[str, str] | None],
     *,
     method: str = "generate",
-    max_wait_seconds: Optional[float] = None,
+    max_wait_seconds: float | None = None,
 ) -> None:
     """
     Launch API server with several different sets of arguments/environments
@@ -785,8 +786,8 @@ def get_physical_device_indices(devices):
 def wait_for_gpu_memory_to_clear(
     *,
     devices: list[int],
-    threshold_bytes: Optional[int] = None,
-    threshold_ratio: Optional[float] = None,
+    threshold_bytes: int | None = None,
+    threshold_ratio: float | None = None,
     timeout_s: float = 120,
 ) -> None:
     assert threshold_bytes is not None or threshold_ratio is not None
@@ -1002,7 +1003,7 @@ def spawn_new_process_for_each_test(f: Callable[_P, None]) -> Callable[_P, None]
 
 
 def create_new_process_for_each_test(
-    method: Optional[Literal["spawn", "fork"]] = None,
+    method: Literal["spawn", "fork"] | None = None,
 ) -> Callable[[Callable[_P, None]], Callable[_P, None]]:
     """Creates a decorator that runs each test function in a new process.
 
@@ -1098,9 +1099,9 @@ async def completions_with_server_args(
     prompts: list[str],
     model_name: str,
     server_cli_args: list[str],
-    num_logprobs: Optional[int],
+    num_logprobs: int | None,
     max_wait_seconds: int = 240,
-    max_tokens: Union[int, list] = 5,
+    max_tokens: int | list = 5,
 ) -> list[Completion]:
     """Construct a remote OpenAI server, obtain an async client to the
     server & invoke the completions API to obtain completions.
diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py
index 7fee73da15a2a..07706d4b956c5 100644
--- a/tests/v1/attention/test_attention_backends.py
+++ b/tests/v1/attention/test_attention_backends.py
@@ -3,7 +3,6 @@
 """Tests for v1 attention backends without GPUModelRunner dependency."""
 
 from functools import partial
-from typing import Optional, Union
 
 import pytest
 import torch
@@ -202,7 +201,7 @@ def run_attention_backend(
     key: torch.Tensor,
     value: torch.Tensor,
     kv_cache: torch.Tensor,
-    sliding_window: Optional[int] = None,
+    sliding_window: int | None = None,
 ) -> torch.Tensor:
     """Run attention computation using the specified backend's AttentionImpl."""
 
@@ -289,7 +288,7 @@ def run_attention_backend(
 def _test_backend_correctness(
     batch_spec: BatchSpec,
     model: str,
-    backend_to_test: list[Union[_Backend, str]],
+    backend_to_test: list[_Backend | str],
     mask_mod,
     *,
     block_size: int = 16,
diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py
index 3b6a9115435c4..35f7c61458f2d 100644
--- a/tests/v1/attention/test_mla_backends.py
+++ b/tests/v1/attention/test_mla_backends.py
@@ -2,8 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for v1 MLA backends without GPUModelRunner dependency."""
 
-from typing import Optional, Union
-
 import pytest
 import torch
 
@@ -81,8 +79,8 @@ def create_and_prepopulate_kv_cache(
     num_blocks: int,
     common_attn_metadata: CommonAttentionMetadata,
     randomize_blocks: bool = True,
-    kv_cache_dtype: Optional[str] = None,
-    scale: Union[float, torch.Tensor] = 1.0,
+    kv_cache_dtype: str | None = None,
+    scale: float | torch.Tensor = 1.0,
 ) -> torch.Tensor:
     """Create and prepopulate an MLA KV cache with context data.
 
diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py
index 819cd81be358d..66a0169cbbd02 100644
--- a/tests/v1/attention/utils.py
+++ b/tests/v1/attention/utils.py
@@ -3,7 +3,6 @@
 """Utility functions for attention-related v1 tests."""
 
 from dataclasses import dataclass
-from typing import Optional, Union
 
 import pytest
 import torch
@@ -150,14 +149,14 @@ def create_vllm_config(
     model_name: str = "meta-llama/Meta-Llama-3-8B",
     tensor_parallel_size: int = 1,
     max_model_len: int = 1024,
-    dtype: Union[ModelDType, torch.dtype] = "auto",
+    dtype: ModelDType | torch.dtype = "auto",
     num_gpu_blocks: int = 1000,
     block_size: int = 16,
     max_num_seqs: int = 256,
     max_num_batched_tokens: int = 8192,
     enable_chunked_prefill: bool = True,
     add_mock_model_methods: bool = True,
-    hf_config_override: Optional[dict] = None,
+    hf_config_override: dict | None = None,
 ) -> VllmConfig:
     """Create a VllmConfig for testing with reasonable defaults."""
 
@@ -252,7 +251,7 @@ class BackendConfig:
     name: str
     env_vars: dict
     comp_config: dict  # compilation config
-    specific_gpu_arch: Optional[tuple] = None
+    specific_gpu_arch: tuple | None = None
 
 
 # Define all backend configurations of full cudagraph to be tested
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index 714a540e86b5e..6b0a5e4b0e3f5 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import importlib
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import pytest
 import torch
@@ -62,9 +62,9 @@ def make_request(
     prompt_token_ids: list[int],
     block_size: int = 3,
     hash_fn: Callable = hash,
-    mm_positions: Optional[list[PlaceholderRange]] = None,
-    mm_hashes: Optional[list[str]] = None,
-    cache_salt: Optional[str] = None,
+    mm_positions: list[PlaceholderRange] | None = None,
+    mm_hashes: list[str] | None = None,
+    cache_salt: str | None = None,
 ):
     mm_features = []
     if mm_positions is not None:
diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py
index d08c1bcc57bd5..a81644ce252ea 100644
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -3,7 +3,7 @@
 """Compare the with and without prefix caching."""
 
 import copy
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import pytest
 import torch
@@ -55,10 +55,10 @@ def make_request(
     prompt_token_ids: list[int],
     block_size: int,
     hash_fn: Callable,
-    mm_positions: Optional[list[PlaceholderRange]] = None,
-    mm_hashes: Optional[list[str]] = None,
-    prompt_logprobs: Optional[int] = None,
-    cache_salt: Optional[str] = None,
+    mm_positions: list[PlaceholderRange] | None = None,
+    mm_hashes: list[str] | None = None,
+    prompt_logprobs: int | None = None,
+    cache_salt: str | None = None,
 ):
     mm_features = []
     if mm_positions is not None:
diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index ff15af70b88bc..76408fba2e169 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import dataclasses
-from typing import Optional
 from unittest.mock import Mock
 
 import pytest
@@ -78,9 +77,7 @@ def test_get_num_unfinished_requests():
         (True, 5),
     ],
 )
-def test_schedule(
-    enable_prefix_caching: Optional[bool], prompt_logprobs: Optional[int]
-):
+def test_schedule(enable_prefix_caching: bool | None, prompt_logprobs: int | None):
     """Test scheduling.
     Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs
     """
@@ -595,7 +592,7 @@ def test_check_stop_min_tokens():
     ],
 )
 def test_schedule_concurrent_batches(
-    enable_prefix_caching: Optional[bool], prompt_logprobs: Optional[int]
+    enable_prefix_caching: bool | None, prompt_logprobs: int | None
 ):
     scheduler = create_scheduler(
         max_num_batched_tokens=1024,
@@ -1323,14 +1320,14 @@ def create_scheduler_with_priority(
     model: str = "facebook/opt-125m",
     max_num_seqs: int = 16,
     max_num_batched_tokens: int = 8192,
-    enable_prefix_caching: Optional[bool] = None,
+    enable_prefix_caching: bool | None = None,
     long_prefill_token_threshold: int = 0,
     disable_chunked_mm_input: bool = False,
     use_kv_connector: bool = False,
     num_blocks: int = 10000,
     block_size: int = 16,
-    max_model_len: Optional[int] = None,
-    num_speculative_tokens: Optional[int] = None,
+    max_model_len: int | None = None,
+    num_speculative_tokens: int | None = None,
 ) -> Scheduler:
     """Create scheduler with priority policy enabled.
 
@@ -1385,7 +1382,7 @@ def create_scheduler_with_priority(
         else None
     )
 
-    speculative_config: Optional[SpeculativeConfig] = None
+    speculative_config: SpeculativeConfig | None = None
     if num_speculative_tokens is not None:
         speculative_config = SpeculativeConfig(
             model="ngram", num_speculative_tokens=num_speculative_tokens
@@ -1420,12 +1417,12 @@ def create_scheduler_with_priority(
 def create_requests_with_priority(
     num_requests: int,
     priorities: list[int],
-    arrival_times: Optional[list[float]] = None,
+    arrival_times: list[float] | None = None,
     num_tokens: int = 10,
-    mm_positions: Optional[list[list[PlaceholderRange]]] = None,
+    mm_positions: list[list[PlaceholderRange]] | None = None,
     max_tokens: int = 16,
-    stop_token_ids: Optional[list[int]] = None,
-    prompt_logprobs: Optional[int] = None,
+    stop_token_ids: list[int] | None = None,
+    prompt_logprobs: int | None = None,
     starting_idx: int = 0,
 ):
     """Create requests with specified priorities and arrival times."""
diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py
index c11cf3e817d19..c7df43359381b 100644
--- a/tests/v1/core/utils.py
+++ b/tests/v1/core/utils.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional, Union
 
 import torch
 
@@ -37,17 +36,17 @@ def create_scheduler(
     model: str = "facebook/opt-125m",
     max_num_seqs: int = 16,
     max_num_batched_tokens: int = 8192,
-    enable_prefix_caching: Optional[bool] = None,
+    enable_prefix_caching: bool | None = None,
     long_prefill_token_threshold: int = 0,
     disable_chunked_mm_input: bool = False,
     use_kv_connector: bool = False,
     num_blocks: int = 10000,
     block_size: int = 16,
-    max_model_len: Optional[int] = None,
-    num_speculative_tokens: Optional[int] = None,
+    max_model_len: int | None = None,
+    num_speculative_tokens: int | None = None,
     skip_tokenizer_init: bool = False,
     async_scheduling: bool = False,
-) -> Union[Scheduler, AsyncScheduler]:
+) -> Scheduler | AsyncScheduler:
     """Create scheduler under test.
 
     Args:
@@ -102,7 +101,7 @@ def create_scheduler(
         else None
     )
 
-    speculative_config: Optional[SpeculativeConfig] = None
+    speculative_config: SpeculativeConfig | None = None
     if num_speculative_tokens is not None:
         speculative_config = SpeculativeConfig(
             model="ngram", num_speculative_tokens=num_speculative_tokens
@@ -141,10 +140,10 @@ _none_hash_initialized = False
 def create_requests(
     num_requests: int,
     num_tokens: int = 10,
-    mm_positions: Optional[list[list[PlaceholderRange]]] = None,
+    mm_positions: list[list[PlaceholderRange]] | None = None,
     max_tokens: int = 16,
-    stop_token_ids: Optional[list[int]] = None,
-    prompt_logprobs: Optional[int] = None,
+    stop_token_ids: list[int] | None = None,
+    prompt_logprobs: int | None = None,
     same_prompt: bool = False,
     block_size: int = 16,
 ) -> list[Request]:
diff --git a/tests/v1/distributed/test_async_llm_dp.py b/tests/v1/distributed/test_async_llm_dp.py
index 28bb91f34c39b..9465f946f858b 100644
--- a/tests/v1/distributed/test_async_llm_dp.py
+++ b/tests/v1/distributed/test_async_llm_dp.py
@@ -5,7 +5,6 @@ import asyncio
 import os
 from contextlib import ExitStack
 from dataclasses import dataclass
-from typing import Optional
 
 import pytest
 
@@ -35,8 +34,8 @@ async def generate(
     prompt: PromptType,
     output_kind: RequestOutputKind,
     max_tokens: int,
-    prompt_logprobs: Optional[int] = None,
-    data_parallel_rank: Optional[int] = None,
+    prompt_logprobs: int | None = None,
+    data_parallel_rank: int | None = None,
 ) -> tuple[int, str]:
     # Ensure generate doesn't complete too fast for cancellation test.
     await asyncio.sleep(0.2)
@@ -91,9 +90,9 @@ async def test_load(
 
         def record(
             self,
-            scheduler_stats: Optional[SchedulerStats],
-            iteration_stats: Optional[IterationStats],
-            mm_cache_stats: Optional[MultiModalCacheStats] = None,
+            scheduler_stats: SchedulerStats | None,
+            iteration_stats: IterationStats | None,
+            mm_cache_stats: MultiModalCacheStats | None = None,
             engine_idx: int = 0,
         ):
             if iteration_stats:
diff --git a/tests/v1/distributed/test_internal_lb_dp.py b/tests/v1/distributed/test_internal_lb_dp.py
index 452d3682e65de..8f7459e95ef67 100644
--- a/tests/v1/distributed/test_internal_lb_dp.py
+++ b/tests/v1/distributed/test_internal_lb_dp.py
@@ -5,7 +5,7 @@ import os
 import threading
 import time
 import traceback
-from typing import Optional, cast
+from typing import cast
 
 import openai  # use the official client for correctness check
 import pytest
@@ -46,7 +46,7 @@ class MultinodeInternalLBServerManager:
         self.tp_size = tp_size
         self.api_server_count = api_server_count
         self.base_server_args = base_server_args
-        self.servers: list[Optional[tuple[RemoteOpenAIServer, list[str]]]] = [None] * (
+        self.servers: list[tuple[RemoteOpenAIServer, list[str]] | None] = [None] * (
             dp_size // dp_per_node
         )
         self.server_threads: list[threading.Thread] = []
@@ -175,7 +175,7 @@ class APIOnlyServerManager:
         self.tp_size = tp_size
         self.api_server_count = api_server_count
         self.base_server_args = base_server_args
-        self.servers: list[Optional[tuple[RemoteOpenAIServer, list[str]]]] = [None] * 2
+        self.servers: list[tuple[RemoteOpenAIServer, list[str]] | None] = [None] * 2
         self.server_threads: list[threading.Thread] = []
 
     def __enter__(self) -> list[tuple[RemoteOpenAIServer, list[str]]]:
diff --git a/tests/v1/e2e/test_min_tokens.py b/tests/v1/e2e/test_min_tokens.py
index e00a3d58debe3..ec7ee0c3ebe64 100644
--- a/tests/v1/e2e/test_min_tokens.py
+++ b/tests/v1/e2e/test_min_tokens.py
@@ -13,8 +13,6 @@ Covers:
 5) Multiple stop conditions
 """
 
-from typing import Optional, Union
-
 import pytest
 
 from vllm import LLM, SamplingParams
@@ -33,9 +31,9 @@ class MinTokensTestCase:
         name: str,
         min_tokens: int,
         max_tokens: int,
-        stop: Optional[Union[str, list[str]]] = None,
-        expected_min_len: Optional[int] = None,
-        expected_exact_len: Optional[int] = None,
+        stop: str | list[str] | None = None,
+        expected_min_len: int | None = None,
+        expected_exact_len: int | None = None,
     ):
         self.name = name
         self.min_tokens = min_tokens
diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index fbbbd0389c265..7dbdf0ca07105 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -1,9 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import random
-from typing import Any, Union
+from typing import Any
 
 import pytest
 import torch
@@ -34,7 +32,7 @@ def get_test_prompts(mm_enabled: bool):
     for kind in random_prompt_type_choices:
         word_choices = ["test", "temp", "hello", "where"]
         word = random.choice(word_choices)
-        prompt: Union[str, list[dict[str, Any]]] = ""
+        prompt: str | list[dict[str, Any]] = ""
         if kind == "repeat":
             prompt = f"""
             please repeat the word '{word}' 10 times.
diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py
index 444d771a18d63..8f715c085b5d1 100644
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -3,7 +3,6 @@
 
 import asyncio
 from contextlib import ExitStack
-from typing import Optional
 from unittest.mock import MagicMock
 
 import pytest
@@ -53,8 +52,8 @@ async def generate(
     output_kind: RequestOutputKind,
     max_tokens: int,
     n: int = 1,
-    prompt_logprobs: Optional[int] = None,
-    cancel_after: Optional[int] = None,
+    prompt_logprobs: int | None = None,
+    cancel_after: int | None = None,
 ) -> tuple[int, str]:
     # Ensure generate doesn't complete too fast for cancellation test.
     await asyncio.sleep(0.2)
@@ -545,9 +544,9 @@ async def collect_outputs(
     prompt: PromptType,
     sampling_params: SamplingParams,
     outputs_list: list[RequestOutput],
-) -> Optional[RequestOutput]:
+) -> RequestOutput | None:
     """Helper to collect outputs and return the final one."""
-    final_output: Optional[RequestOutput] = None
+    final_output: RequestOutput | None = None
     async for output in engine.generate(
         request_id=request_id, prompt=prompt, sampling_params=sampling_params
     ):
diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py
index bc04d1f93f951..32eeaebbca917 100644
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -8,7 +8,7 @@ import time
 import uuid
 from dataclasses import dataclass
 from threading import Thread
-from typing import Any, Optional, Union
+from typing import Any
 from unittest.mock import MagicMock
 
 import pytest
@@ -41,7 +41,7 @@ PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
 
 
 def make_request(
-    params: SamplingParams, prompt_tokens_ids: Optional[list[int]] = None
+    params: SamplingParams, prompt_tokens_ids: list[int] | None = None
 ) -> EngineCoreRequest:
     if not prompt_tokens_ids:
         prompt_tokens_ids = PROMPT_TOKENS
@@ -113,9 +113,7 @@ async def loop_until_fully_done_async(client: EngineCoreClient, outputs: dict):
 
 
 # Dummy utility function to monkey-patch into engine core.
-def echo(
-    self, msg: str, err_msg: Optional[str] = None, sleep: Optional[float] = None
-) -> str:
+def echo(self, msg: str, err_msg: str | None = None, sleep: float | None = None) -> str:
     print(f"echo util function called: {msg}, {err_msg}")
     if sleep is not None:
         time.sleep(sleep)
@@ -317,7 +315,7 @@ def echo_dc(
     self,
     msg: str,
     return_list: bool = False,
-) -> Union[MyDataclass, list[MyDataclass]]:
+) -> MyDataclass | list[MyDataclass]:
     print(f"echo dc util function called: {msg}")
     val = None if msg is None else MyDataclass(msg)
     # Return dataclass to verify support for returning custom types
@@ -330,7 +328,7 @@ def echo_dc_dict(
     self,
     msg: str,
     return_dict: bool = False,
-) -> Union[MyDataclass, dict[str, MyDataclass]]:
+) -> MyDataclass | dict[str, MyDataclass]:
     print(f"echo dc dict util function called: {msg}")
     val = None if msg is None else MyDataclass(msg)
     # Return dict of dataclasses to verify support for returning dicts
diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py
index 3f6f2211556f5..c1d5f8af79177 100644
--- a/tests/v1/engine/test_llm_engine.py
+++ b/tests/v1/engine/test_llm_engine.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import random
 from typing import TYPE_CHECKING
 
@@ -13,6 +11,8 @@ from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Metric, Vector
 
 if TYPE_CHECKING:
     from tests.conftest import VllmRunner
+else:
+    VllmRunner = object
 
 MODEL = "facebook/opt-125m"
 DTYPE = "half"
diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py
index 9ebf7f09503e5..28ebe5166d962 100644
--- a/tests/v1/engine/test_output_processor.py
+++ b/tests/v1/engine/test_output_processor.py
@@ -3,7 +3,6 @@
 
 import math
 import time
-from typing import Optional
 
 import pytest
 
@@ -118,13 +117,13 @@ def test_incremental_detokenization(
 
 def _validate_logprobs(
     gen_tokens: dict[str, list[int]],
-    gen_logprobs: dict[str, Optional[SampleLogprobs]],
-    gen_prompt_logprobs: dict[str, Optional[PromptLogprobs]],
+    gen_logprobs: dict[str, SampleLogprobs | None],
+    gen_prompt_logprobs: dict[str, PromptLogprobs | None],
     gen_cumulative_logprob: dict[str, float],
     dtv: DummyOutputProcessorTestVectors,
     request_id_list: list[str],
-    num_sample_logprobs: Optional[int],
-    num_prompt_logprobs: Optional[int],
+    num_sample_logprobs: int | None,
+    num_prompt_logprobs: int | None,
 ) -> None:
     for req_idx, req_id in enumerate(request_id_list):
         new_tokens = gen_tokens[req_id]
@@ -413,8 +412,8 @@ def _validate_logprobs(
 @pytest.mark.parametrize("num_prompt_logprobs", [None, NUM_PROMPT_LOGPROBS_UNDER_TEST])
 def test_logprobs_processor(
     request_output_kind: RequestOutputKind,
-    num_sample_logprobs: Optional[int],
-    num_prompt_logprobs: Optional[int],
+    num_sample_logprobs: int | None,
+    num_prompt_logprobs: int | None,
     dummy_test_vectors,
 ):
     output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
@@ -530,7 +529,7 @@ def test_logprobs_processor(
 )
 def test_stop_token(
     include_stop_str_in_output: bool,
-    num_sample_logprobs: Optional[int],
+    num_sample_logprobs: int | None,
     stop_token_type: str,
     ignore_eos: bool,
     dummy_test_vectors,
@@ -696,7 +695,7 @@ def test_stop_token(
 @pytest.mark.parametrize("num_sample_logprobs", [None, NUM_SAMPLE_LOGPROBS_UNDER_TEST])
 def test_stop_string(
     include_stop_str_in_output: bool,
-    num_sample_logprobs: Optional[int],
+    num_sample_logprobs: int | None,
     dummy_test_vectors,
 ):
     output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py
index 9b720f6eb668e..23684a2c55cef 100644
--- a/tests/v1/engine/utils.py
+++ b/tests/v1/engine/utils.py
@@ -3,7 +3,7 @@
 
 import random
 from dataclasses import dataclass
-from typing import Optional, Union
+from typing import TypeAlias
 
 import torch
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
@@ -12,7 +12,7 @@ from vllm.engine.arg_utils import EngineArgs
 from vllm.v1.engine import EngineCoreOutput, FinishReason
 from vllm.v1.outputs import LogprobsLists, LogprobsTensors
 
-GeneralTokenizerType = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
+GeneralTokenizerType: TypeAlias = PreTrainedTokenizer | PreTrainedTokenizerFast
 
 # Number of sample logprobs to request when testing sample logprobs
 NUM_SAMPLE_LOGPROBS_UNDER_TEST = 5
@@ -332,16 +332,15 @@ class MockEngineCore:
         # For each request, for each sampled token offset,
         # a tuple of
         # (list of topk token ids, list of sample logprob vals, rank)
-        generated_logprobs_raw: Optional[
-            list[list[tuple[list[int], list[float], int]]]
-        ] = None,
+        generated_logprobs_raw: list[list[tuple[list[int], list[float], int]]]
+        | None = None,
         # For each request, a tuple of
         # (prompt logprob val matrix, prompt logprob tok id matrix);
         # each matrix has dimensions
         # (num prompt toks) x (num prompt logprobs+1)
-        prompt_logprobs_raw: Optional[list[LogprobsTensors]] = None,
-        eos_token_id: Optional[int] = None,
-        stop_token_ids: Optional[list[int]] = None,
+        prompt_logprobs_raw: list[LogprobsTensors] | None = None,
+        eos_token_id: int | None = None,
+        stop_token_ids: list[int] | None = None,
         ignore_eos: bool = False,
     ) -> None:
         self.num_requests = len(tokens_list)
diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py
index b5d04679317e6..cca9729b9d0ba 100644
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -2,8 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import json
 from dataclasses import fields
 from enum import Enum
@@ -30,6 +28,8 @@ from vllm.sampling_params import (
 
 if TYPE_CHECKING:
     from vllm.config.model import TokenizerMode
+else:
+    TokenizerMode = str
 
 NGRAM_SPEC_CONFIG = {
     "model": "[ngram]",
diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py
index 66dbed2b9fddf..c66a66b84b62f 100644
--- a/tests/v1/entrypoints/openai/test_completion.py
+++ b/tests/v1/entrypoints/openai/test_completion.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import openai  # use the official client for correctness check
 import pytest
@@ -195,7 +194,7 @@ async def test_too_many_completion_logprobs(
     [(MODEL_NAME, -1), (MODEL_NAME, 0), (MODEL_NAME, 1), (MODEL_NAME, None)],
 )
 async def test_prompt_logprobs_completion(
-    client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: Optional[int]
+    client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: int | None
 ):
     params: dict = {
         "prompt": ["A robot may not injure another robot", "My name is"],
diff --git a/tests/v1/executor/test_executor.py b/tests/v1/executor/test_executor.py
index c8bcd62d66802..7293ad09a7176 100644
--- a/tests/v1/executor/test_executor.py
+++ b/tests/v1/executor/test_executor.py
@@ -3,7 +3,8 @@
 
 import asyncio
 import os
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable
+from typing import Any
 
 import pytest
 
@@ -20,12 +21,12 @@ class Mock: ...
 class CustomMultiprocExecutor(MultiprocExecutor):
     def collective_rpc(
         self,
-        method: Union[str, Callable],
-        timeout: Optional[float] = None,
+        method: str | Callable,
+        timeout: float | None = None,
         args: tuple = (),
-        kwargs: Optional[dict] = None,
+        kwargs: dict | None = None,
         non_block: bool = False,
-        unique_reply_rank: Optional[int] = None,
+        unique_reply_rank: int | None = None,
     ) -> list[Any]:
         # Drop marker to show that this was run
         with open(".marker", "w"):
diff --git a/tests/v1/kv_connector/unit/test_kv_load_failure_recovery.py b/tests/v1/kv_connector/unit/test_kv_load_failure_recovery.py
index 0902fbfe85f33..6b7b2226e758a 100644
--- a/tests/v1/kv_connector/unit/test_kv_load_failure_recovery.py
+++ b/tests/v1/kv_connector/unit/test_kv_load_failure_recovery.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable
+from collections.abc import Callable
 from unittest.mock import Mock
 
 import pytest
diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py
index a1f53cb255630..0a73e2a78f2f3 100644
--- a/tests/v1/kv_connector/unit/test_nixl_connector.py
+++ b/tests/v1/kv_connector/unit/test_nixl_connector.py
@@ -9,7 +9,6 @@ import textwrap
 import time
 import uuid
 from collections import defaultdict
-from typing import Optional
 from unittest.mock import patch
 
 import pytest
@@ -154,7 +153,7 @@ class FakeNixlWrapper:
         local_block_descs_ids: list[int],
         remote_xfer_side_handle: int,
         remote_block_descs_ids: list[int],
-        notif_msg: Optional[bytes] = None,
+        notif_msg: bytes | None = None,
     ) -> int:
         return uuid.uuid4().int
 
@@ -982,7 +981,7 @@ class FakePlatform(Platform):
         return {"oot": ("oot",)}
 
     @classmethod
-    def get_nixl_memory_type(cls) -> Optional[str]:
+    def get_nixl_memory_type(cls) -> str | None:
         """
         Returns the nixl memory type for the current platform.
         """
diff --git a/tests/v1/kv_connector/unit/test_output_aggreagator.py b/tests/v1/kv_connector/unit/test_output_aggreagator.py
index d05cbe1a2fd46..2635b256b54ee 100644
--- a/tests/v1/kv_connector/unit/test_output_aggreagator.py
+++ b/tests/v1/kv_connector/unit/test_output_aggreagator.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from concurrent.futures import Future
-from typing import Optional
 
 import pytest
 
@@ -14,9 +13,9 @@ pytestmark = pytest.mark.cpu_test
 class DummyModelRunnerOutput(ModelRunnerOutput):
     def __init__(
         self,
-        finished_sending: Optional[set[str]] = None,
-        finished_recving: Optional[set[str]] = None,
-        invalid_block_ids: Optional[set[int]] = None,
+        finished_sending: set[str] | None = None,
+        finished_recving: set[str] | None = None,
+        invalid_block_ids: set[int] | None = None,
     ):
         self.kv_connector_output = KVConnectorOutput(
             finished_sending=finished_sending,
diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py
index 6f51b9bbcbdaa..b07fd0536a436 100644
--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -2,8 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import tempfile
 from collections import defaultdict
+from collections.abc import Callable
 from itertools import count
-from typing import Any, Callable, Optional
+from typing import Any
 
 import torch
 
@@ -147,7 +148,7 @@ _none_hash_initialized = False
 
 
 def create_request(
-    request_id: Optional[int] = None,
+    request_id: int | None = None,
     num_tokens: int = 10,
     common_prefix_len=0,
     max_tokens: int = 16,
@@ -168,7 +169,7 @@ def create_request(
         init_none_hash(hash_fn)
         _none_hash_initialized = True
 
-    kv_transfer_params: Optional[dict[str, Any]] = None
+    kv_transfer_params: dict[str, Any] | None = None
 
     if do_remote_decode:
         assert not do_remote_prefill
@@ -205,9 +206,9 @@ def create_request(
 
 def create_model_runner_output(
     reqs: list[Request],
-    finished_sending: Optional[set[str]] = None,
-    finished_recving: Optional[set[str]] = None,
-    invalid_block_ids: Optional[set[int]] = None,
+    finished_sending: set[str] | None = None,
+    finished_recving: set[str] | None = None,
+    invalid_block_ids: set[int] | None = None,
     use_eos: bool = False,
     token_id: int = 0,
 ) -> ModelRunnerOutput:
diff --git a/tests/v1/kv_offload/test_cpu_manager.py b/tests/v1/kv_offload/test_cpu_manager.py
index 57884f846b513..4f90ca022ceff 100644
--- a/tests/v1/kv_offload/test_cpu_manager.py
+++ b/tests/v1/kv_offload/test_cpu_manager.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable
 from dataclasses import dataclass
-from typing import Optional
 
 import numpy as np
 
@@ -29,7 +28,7 @@ def to_hashes(int_hashes: list[int]) -> list[BlockHash]:
 
 
 def verify_store_output(
-    prepare_store_output: Optional[PrepareStoreOutput],
+    prepare_store_output: PrepareStoreOutput | None,
     expected_prepare_store_output: ExpectedPrepareStoreOutput,
 ):
     assert prepare_store_output is not None
diff --git a/tests/v1/logits_processors/test_correctness.py b/tests/v1/logits_processors/test_correctness.py
index 538b6281f5a07..9682a7c0c8b35 100644
--- a/tests/v1/logits_processors/test_correctness.py
+++ b/tests/v1/logits_processors/test_correctness.py
@@ -3,7 +3,7 @@
 
 import random
 from collections.abc import Callable
-from typing import NamedTuple, Optional, Union
+from typing import NamedTuple, TypeAlias
 
 import numpy as np
 import pytest
@@ -48,7 +48,7 @@ REQS_PER_LOGITPROC = 50
 STR_NO_LOGITPROC = "none"
 
 # LogitsProcessor subclass or "none"
-LogitprocType = Union[type[LogitsProcessor], str]
+LogitprocType: TypeAlias = type[LogitsProcessor] | str
 
 
 class LogitsProcsRequestParams:
@@ -435,7 +435,7 @@ class LogitsprocTestHelpers(NamedTuple):
     """Supports setting up and validating logitsprocs unit tests."""
 
     eval_fxn: Callable
-    gen_request_fxn: Optional[Callable] = None
+    gen_request_fxn: Callable | None = None
 
 
 logitsprocs_test_mapping = {
@@ -471,7 +471,7 @@ def _generate_fake_step_update(
     workload_params: list[LogitsProcsRequestParams],
     wdx: int,
     batch_update_builder: BatchUpdateBuilder,
-) -> tuple[Optional[BatchUpdate], int, int]:
+) -> tuple[BatchUpdate | None, int, int]:
     batch_size = len(persistent_batch)
     workload_size = len(workload_params)
     workload_reqs_remaining = workload_size - wdx
diff --git a/tests/v1/logits_processors/test_custom_offline.py b/tests/v1/logits_processors/test_custom_offline.py
index 95ddb18491691..1899737737f4b 100644
--- a/tests/v1/logits_processors/test_custom_offline.py
+++ b/tests/v1/logits_processors/test_custom_offline.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import random
 import sys
-from typing import Any, Union
+from typing import Any
 
 import pytest
 
@@ -159,7 +159,7 @@ def test_custom_logitsprocs(monkeypatch, logitproc_source: CustomLogitprocSource
         _run_test({}, logitproc_loaded=True)
         return
 
-    kwargs: dict[str, list[Union[str, type[LogitsProcessor]]]] = {}
+    kwargs: dict[str, list[str | type[LogitsProcessor]]] = {}
     if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_FQCN:
         # Scenario: load logitproc based on fully-qualified class name (FQCN)
         # Inject dummy module which defines logitproc
diff --git a/tests/v1/logits_processors/test_custom_online.py b/tests/v1/logits_processors/test_custom_online.py
index 9c5b4ff0ba170..0d902b46bed5a 100644
--- a/tests/v1/logits_processors/test_custom_online.py
+++ b/tests/v1/logits_processors/test_custom_online.py
@@ -4,7 +4,7 @@
 import os
 import random
 import sys
-from typing import Any, Optional
+from typing import Any
 
 import openai
 import pytest
@@ -25,7 +25,7 @@ from tests.v1.logits_processors.utils import entry_points as fake_entry_points
 
 
 def _server_with_logitproc_entrypoint(
-    env_dict: Optional[dict[str, str]],
+    env_dict: dict[str, str] | None,
     model: str,
     vllm_serve_args: list[str],
 ) -> None:
@@ -48,7 +48,7 @@ def _server_with_logitproc_entrypoint(
 
 
 def _server_with_logitproc_module(
-    env_dict: Optional[dict[str, str]],
+    env_dict: dict[str, str] | None,
     model: str,
     vllm_serve_args: list[str],
 ) -> None:
diff --git a/tests/v1/logits_processors/utils.py b/tests/v1/logits_processors/utils.py
index 9a1d5505a5f99..36cffebb3b457 100644
--- a/tests/v1/logits_processors/utils.py
+++ b/tests/v1/logits_processors/utils.py
@@ -3,7 +3,7 @@
 
 import types
 from enum import Enum, auto
-from typing import Any, Optional
+from typing import Any
 
 import torch
 
@@ -61,7 +61,7 @@ class DummyLogitsProcessor(LogitsProcessor):
         """Never impacts greedy sampling"""
         return False
 
-    def update_state(self, batch_update: Optional[BatchUpdate]):
+    def update_state(self, batch_update: BatchUpdate | None):
         process_dict_updates(
             self.req_info,
             batch_update,
@@ -145,7 +145,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
     def new_req_logits_processor(
         self,
         params: SamplingParams,
-    ) -> Optional[RequestLogitsProcessor]:
+    ) -> RequestLogitsProcessor | None:
         """This method returns a new request-level logits processor, customized
         to the `target_token` value associated with a particular request.
 
@@ -159,7 +159,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
         Returns:
           `Callable` request logits processor, or None
         """
-        target_token: Optional[Any] = params.extra_args and params.extra_args.get(
+        target_token: Any | None = params.extra_args and params.extra_args.get(
             "target_token"
         )
         if target_token is None:
diff --git a/tests/v1/sample/test_rejection_sampler.py b/tests/v1/sample/test_rejection_sampler.py
index 8df10f8c3afa5..4c11af2fa3a11 100644
--- a/tests/v1/sample/test_rejection_sampler.py
+++ b/tests/v1/sample/test_rejection_sampler.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any, Optional
+from typing import Any
 
 import pytest
 import torch
@@ -24,7 +24,7 @@ def rejection_sampler():
 def create_logits_tensor(
     output_token_ids: list[list[int]],
     vocab_size: int = 100,
-    token_idx_to_override: Optional[int] = None,
+    token_idx_to_override: int | None = None,
 ) -> torch.Tensor:
     """Helper function to create logits tensor that
     will produce desired token ids on argmax"""
@@ -43,18 +43,18 @@ def create_logits_tensor(
 
 def create_sampling_metadata(
     all_greedy: bool,
-    output_token_ids: Optional[list[list[int]]] = None,
-    prompt_token_ids: Optional[torch.Tensor] = None,
-    spec_token_ids: Optional[torch.Tensor] = None,
-    temperature: Optional[torch.Tensor] = None,
-    top_k: Optional[torch.Tensor] = None,
-    top_p: Optional[torch.Tensor] = None,
-    generators: Optional[dict[int, Any]] = None,
-    frequency_penalties: Optional[list[float]] = None,
-    presence_penalties: Optional[list[float]] = None,
-    repetition_penalties: Optional[list[float]] = None,
-    bad_words_token_ids: Optional[dict[int, list[list[int]]]] = None,
-    allowed_token_ids_mask: Optional[torch.Tensor] = None,
+    output_token_ids: list[list[int]] | None = None,
+    prompt_token_ids: torch.Tensor | None = None,
+    spec_token_ids: torch.Tensor | None = None,
+    temperature: torch.Tensor | None = None,
+    top_k: torch.Tensor | None = None,
+    top_p: torch.Tensor | None = None,
+    generators: dict[int, Any] | None = None,
+    frequency_penalties: list[float] | None = None,
+    presence_penalties: list[float] | None = None,
+    repetition_penalties: list[float] | None = None,
+    bad_words_token_ids: dict[int, list[list[int]]] | None = None,
+    allowed_token_ids_mask: torch.Tensor | None = None,
 ) -> SamplingMetadata:
     """Create a v1 sampling metadata object with all_greedy set
     to the given value. Either all greedy or all random sampling
diff --git a/tests/v1/sample/utils.py b/tests/v1/sample/utils.py
index b1c63327b852b..5d457762fc644 100644
--- a/tests/v1/sample/utils.py
+++ b/tests/v1/sample/utils.py
@@ -3,7 +3,7 @@
 
 from collections.abc import Iterator
 from enum import Enum
-from typing import NamedTuple, Optional
+from typing import NamedTuple
 
 import regex as re
 import torch
@@ -23,7 +23,7 @@ class BatchLogprobsComposition(Enum):
     SAMPLE_PROMPT = 3
 
 
-BatchLogprobsSpecType = list[tuple[Optional[int], Optional[int]]]
+BatchLogprobsSpecType = list[tuple[int | None, int | None]]
 
 
 def get_test_batch(
@@ -222,8 +222,8 @@ def create_allowed_token_ids(
     vocab_size: int,
     num_allowed_token_ids: int,
     device: torch.device,
-) -> Optional[torch.Tensor]:
-    mask: Optional[torch.Tensor] = None
+) -> torch.Tensor | None:
+    mask: torch.Tensor | None = None
     for i in range(batch_size):
         if i % 2 == 1:
             continue
diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py
index 0f0a3722ef2dd..47d05a20a65df 100644
--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 from unittest import mock
 
 import pytest
@@ -39,7 +38,7 @@ eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
 def _create_proposer(
     method: str,
     num_speculative_tokens: int,
-    speculative_token_tree: Optional[list[tuple[int, ...]]] = None,
+    speculative_token_tree: list[tuple[int, ...]] | None = None,
 ) -> EagleProposer:
     model_config = ModelConfig(model=model_dir, runner="generate", max_model_len=100)
 
diff --git a/tests/v1/spec_decode/test_tree_attention.py b/tests/v1/spec_decode/test_tree_attention.py
index b31a2f27f54b0..b365e75d5514c 100644
--- a/tests/v1/spec_decode/test_tree_attention.py
+++ b/tests/v1/spec_decode/test_tree_attention.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
-from typing import Optional
 
 import torch
 
@@ -37,7 +36,7 @@ def forward_attention(
     slot_mapping: torch.Tensor,
     seqlen_k: int,
     backend: _Backend,
-    spec_token_tree: Optional[str] = None,
+    spec_token_tree: str | None = None,
     num_spec_tokens: int = 0,
 ) -> torch.Tensor:
     batch_size, q_len, num_heads, dim_per_head = q.shape
diff --git a/tests/v1/test_serial_utils.py b/tests/v1/test_serial_utils.py
index a306a2b040d3a..00749c5415c8e 100644
--- a/tests/v1/test_serial_utils.py
+++ b/tests/v1/test_serial_utils.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections import UserDict
 from dataclasses import dataclass
-from typing import Optional
 
 import msgspec
 import numpy as np
@@ -100,7 +99,7 @@ def test_encode_decode(monkeypatch: pytest.MonkeyPatch):
 
 
 class MyRequest(msgspec.Struct):
-    mm: Optional[list[MultiModalKwargsItems]]
+    mm: list[MultiModalKwargsItems] | None
 
 
 def test_multimodal_kwargs():
diff --git a/tests/v1/tpu/test_basic.py b/tests/v1/tpu/test_basic.py
index f3495b00d3d4c..0d53a02476fab 100644
--- a/tests/v1/tpu/test_basic.py
+++ b/tests/v1/tpu/test_basic.py
@@ -5,8 +5,6 @@
 Run `pytest tests/v1/tpu/test_basic.py`.
 """
 
-from __future__ import annotations
-
 from typing import TYPE_CHECKING
 
 import pytest
@@ -16,6 +14,8 @@ from vllm.platforms import current_platform
 
 if TYPE_CHECKING:
     from tests.conftest import VllmRunner
+else:
+    VllmRunner = object
 
 MODELS = [
     "Qwen/Qwen2.5-1.5B-Instruct",
diff --git a/tests/v1/tpu/test_perf.py b/tests/v1/tpu/test_perf.py
index b7b6835c40ccb..e230491cddb01 100644
--- a/tests/v1/tpu/test_perf.py
+++ b/tests/v1/tpu/test_perf.py
@@ -5,8 +5,6 @@
 Run `pytest tests/v1/tpu/test_perf.py`.
 """
 
-from __future__ import annotations
-
 import time
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
@@ -20,6 +18,8 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
 
 if TYPE_CHECKING:
     from tests.conftest import VllmRunner
+else:
+    VllmRunner = object
 
 
 @dataclass
diff --git a/tests/v1/tracing/test_tracing.py b/tests/v1/tracing/test_tracing.py
index 505da41631438..11d9d18ead7db 100644
--- a/tests/v1/tracing/test_tracing.py
+++ b/tests/v1/tracing/test_tracing.py
@@ -2,8 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # ruff: noqa
 # type: ignore
-from __future__ import annotations
-
 import threading
 from collections.abc import Iterable
 from concurrent import futures
diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py
index 5a598dcab7189..5ab67dcf761e4 100644
--- a/tests/v1/worker/test_gpu_input_batch.py
+++ b/tests/v1/worker/test_gpu_input_batch.py
@@ -3,7 +3,6 @@
 
 import inspect
 from collections.abc import Sequence
-from typing import Optional
 
 import numpy as np
 import pytest
@@ -271,7 +270,7 @@ def test_sampling_metadata_in_input_batch(device: str, batch_size: int):
         reqs, req_ids_retained, input_batch.req_id_to_index, device=torch.device(device)
     )
 
-    def same(t1: Optional[torch.Tensor], t2: Optional[torch.Tensor]) -> bool:
+    def same(t1: torch.Tensor | None, t2: torch.Tensor | None) -> bool:
         return (t1 is None and t2 is None) or (
             t1 is not None and t2 is not None and torch.allclose(t1, t2)
         )
diff --git a/tests/v1/worker/test_worker_memory_snapshot.py b/tests/v1/worker/test_worker_memory_snapshot.py
index cbfb9a8dc0b60..b9b2e076fd396 100644
--- a/tests/v1/worker/test_worker_memory_snapshot.py
+++ b/tests/v1/worker/test_worker_memory_snapshot.py
@@ -4,8 +4,7 @@
 import multiprocessing as mp
 import os
 import tempfile
-from multiprocessing import Queue
-from typing import Optional
+from multiprocessing.queues import Queue
 from unittest.mock import patch
 
 import pytest
@@ -16,7 +15,7 @@ from vllm.utils import MemorySnapshot
 from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
 
 # Global queue to track operation order across processes
-_QUEUE: Optional[Queue] = None
+_QUEUE: Queue | None = None
 
 
 def track_operation(operation: str, rank: int):
diff --git a/tests/vllm_test_utils/vllm_test_utils/blame.py b/tests/vllm_test_utils/vllm_test_utils/blame.py
index e2cab92ea22b2..9746c3964e21f 100644
--- a/tests/vllm_test_utils/vllm_test_utils/blame.py
+++ b/tests/vllm_test_utils/vllm_test_utils/blame.py
@@ -5,8 +5,7 @@ import contextlib
 import dataclasses
 import sys
 import traceback
-from collections.abc import Generator
-from typing import Callable
+from collections.abc import Callable, Generator
 
 
 @dataclasses.dataclass
diff --git a/tests/vllm_test_utils/vllm_test_utils/monitor.py b/tests/vllm_test_utils/vllm_test_utils/monitor.py
index e2f1212ed554b..ba22bde8795b3 100644
--- a/tests/vllm_test_utils/vllm_test_utils/monitor.py
+++ b/tests/vllm_test_utils/vllm_test_utils/monitor.py
@@ -5,8 +5,8 @@ import contextlib
 import dataclasses
 import sys
 import traceback
-from collections.abc import Generator
-from typing import Callable, Generic, TypeVar
+from collections.abc import Callable, Generator
+from typing import Generic, TypeVar
 
 _T = TypeVar("_T")
 
diff --git a/tools/check_init_lazy_imports.py b/tools/check_init_lazy_imports.py
index 9255aa17db6a6..197cc8ff8f5ed 100644
--- a/tools/check_init_lazy_imports.py
+++ b/tools/check_init_lazy_imports.py
@@ -5,8 +5,6 @@ i.e: appears only within the ``if typing.TYPE_CHECKING:`` guard,
 **except** for a short whitelist.
 """
 
-from __future__ import annotations
-
 import ast
 import pathlib
 import sys
diff --git a/tools/enforce_regex_import.py b/tools/enforce_regex_import.py
index 69f43cadc7677..a29952e92264d 100644
--- a/tools/enforce_regex_import.py
+++ b/tools/enforce_regex_import.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import subprocess
 from pathlib import Path
 
diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py
index 22ee08535bddb..de29028da618e 100755
--- a/tools/pre_commit/mypy.py
+++ b/tools/pre_commit/mypy.py
@@ -20,7 +20,6 @@ Args:
 
 import subprocess
 import sys
-from typing import Optional
 
 import regex as re
 
@@ -96,8 +95,8 @@ def group_files(changed_files: list[str]) -> dict[str, list[str]]:
 
 def mypy(
     targets: list[str],
-    python_version: Optional[str],
-    follow_imports: Optional[str],
+    python_version: str | None,
+    follow_imports: str | None,
     file_group: str,
 ) -> int:
     """
diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py
index cdab004366f9d..a049dc0425dd6 100644
--- a/tools/profiler/visualize_layerwise_profile.py
+++ b/tools/profiler/visualize_layerwise_profile.py
@@ -7,7 +7,7 @@ import json
 import math
 import os
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any
 
 import matplotlib.pyplot as plt
 import pandas as pd
@@ -373,7 +373,7 @@ def plot_trace_df(
     traces_df: pd.DataFrame,
     plot_metric: str,
     plot_title: str,
-    output: Optional[Path] = None,
+    output: Path | None = None,
 ):
     def get_phase_description(traces_df: pd.DataFrame, phase: str) -> str:
         phase_df = traces_df.query(f'phase == "{phase}"')
diff --git a/vllm/_bc_linter.py b/vllm/_bc_linter.py
index af68396af0b5a..2929a8bce85ac 100644
--- a/vllm/_bc_linter.py
+++ b/vllm/_bc_linter.py
@@ -1,9 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # vllm/_bc_linter.py
-from __future__ import annotations
-
-from typing import Any, Callable, TypeVar, overload
+from collections.abc import Callable
+from typing import Any, TypeVar, overload
 
 T = TypeVar("T")
 
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index eac0a5009e81f..f1ed3bac80c60 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING, Literal, Optional, Union
+from typing import TYPE_CHECKING, Literal
 
 import torch
 
@@ -37,7 +37,7 @@ def paged_attention_v1(
     seq_lens: torch.Tensor,
     block_size: int,
     max_seq_len: int,
-    alibi_slopes: Optional[torch.Tensor],
+    alibi_slopes: torch.Tensor | None,
     kv_cache_dtype: str,
     k_scale: torch.Tensor,
     v_scale: torch.Tensor,
@@ -84,7 +84,7 @@ def paged_attention_v2(
     seq_lens: torch.Tensor,
     block_size: int,
     max_seq_len: int,
-    alibi_slopes: Optional[torch.Tensor],
+    alibi_slopes: torch.Tensor | None,
     kv_cache_dtype: str,
     k_scale: torch.Tensor,
     v_scale: torch.Tensor,
@@ -132,14 +132,14 @@ def paged_attention_rocm(
     scale: float,
     block_tables: torch.Tensor,
     seq_lens: torch.Tensor,
-    query_start_loc: Optional[torch.Tensor],
+    query_start_loc: torch.Tensor | None,
     block_size: int,
     max_seq_len: int,
-    alibi_slopes: Optional[torch.Tensor],
+    alibi_slopes: torch.Tensor | None,
     kv_cache_dtype: str,
     k_scale: torch.Tensor,
     v_scale: torch.Tensor,
-    fp8_out_scale: Optional[torch.Tensor] = None,
+    fp8_out_scale: torch.Tensor | None = None,
     mfma_type: str = "fp8" if envs.VLLM_ROCM_FP8_MFMA_PAGE_ATTN else "f16",
 ) -> None:
     torch.ops._rocm_C.paged_attention(
@@ -186,7 +186,7 @@ def merge_attn_states(
     prefix_lse: torch.Tensor,
     suffix_output: torch.Tensor,
     suffix_lse: torch.Tensor,
-    output_lse: Optional[torch.Tensor] = None,
+    output_lse: torch.Tensor | None = None,
 ) -> None:
     torch.ops._C.merge_attn_states(
         output, output_lse, prefix_output, prefix_lse, suffix_output, suffix_lse
@@ -314,7 +314,7 @@ def convert_vertical_slash_indexes_mergehead(
 def rotary_embedding(
     positions: torch.Tensor,
     query: torch.Tensor,
-    key: Optional[torch.Tensor],
+    key: torch.Tensor | None,
     head_size: int,
     cos_sin_cache: torch.Tensor,
     is_neox: bool,
@@ -408,8 +408,8 @@ def rms_norm_dynamic_per_token_quant(
     weight: torch.Tensor,
     epsilon: float,
     quant_dtype: torch.dtype,
-    scale_ub: Optional[torch.Tensor] = None,
-    residual: Optional[torch.Tensor] = None,
+    scale_ub: torch.Tensor | None = None,
+    residual: torch.Tensor | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     output = torch.empty_like(input, dtype=quant_dtype)
     scales = torch.empty(
@@ -527,14 +527,14 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"):
     @register_fake("_C::gptq_marlin_gemm")
     def _gptq_marlin_gemm_fake(
         a: torch.Tensor,
-        c: Optional[torch.Tensor],
+        c: torch.Tensor | None,
         b_q_weight: torch.Tensor,
-        b_bias: Optional[torch.Tensor],
+        b_bias: torch.Tensor | None,
         b_scales: torch.Tensor,
-        global_scale: Optional[torch.Tensor],
-        b_zeros: Optional[torch.Tensor],
-        g_idx: Optional[torch.Tensor],
-        perm: Optional[torch.Tensor],
+        global_scale: torch.Tensor | None,
+        b_zeros: torch.Tensor | None,
+        g_idx: torch.Tensor | None,
+        perm: torch.Tensor | None,
         workspace: torch.Tensor,
         b_q_type_id: int,
         size_m: torch.SymInt,
@@ -582,13 +582,13 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"):
         # b_q Should be the tensor returned by machete_prepack_B
         b_q: torch.Tensor,
         b_type: ScalarType,
-        out_type: Optional[torch.dtype] = None,
-        b_group_scales: Optional[torch.Tensor] = None,
-        b_group_zeros: Optional[torch.Tensor] = None,
-        b_group_size: Optional[int] = None,
-        b_channel_scales: Optional[torch.Tensor] = None,
-        a_token_scales: Optional[torch.Tensor] = None,
-        schedule: Optional[str] = None,
+        out_type: torch.dtype | None = None,
+        b_group_scales: torch.Tensor | None = None,
+        b_group_zeros: torch.Tensor | None = None,
+        b_group_size: int | None = None,
+        b_channel_scales: torch.Tensor | None = None,
+        a_token_scales: torch.Tensor | None = None,
+        schedule: str | None = None,
     ) -> torch.Tensor:
         m = a.size(0)
         n = b_q.size(1)
@@ -599,7 +599,7 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"):
         b_q_weight: torch.Tensor,
         a_type: torch.dtype,
         b_type: ScalarType,
-        group_scales_type: Optional[torch.dtype],
+        group_scales_type: torch.dtype | None,
     ) -> torch.Tensor:
         return torch.empty_like(b_q_weight, memory_format=torch.contiguous_format)
 
@@ -612,8 +612,8 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"):
         b_group_size: int,
         b_channel_scales: torch.Tensor,
         a_token_scales: torch.Tensor,
-        out_type: Optional[torch.dtype] = None,
-        maybe_schedule: Optional[str] = None,
+        out_type: torch.dtype | None = None,
+        maybe_schedule: str | None = None,
     ) -> torch.Tensor:
         m = a.size(0)
         n = b_q.size(1)
@@ -636,7 +636,7 @@ if hasattr(torch.ops._C, "allspark_w8a16_gemm"):
         a: torch.Tensor,
         b_qweight: torch.Tensor,
         b_scales: torch.Tensor,
-        b_qzeros: Optional[torch.Tensor],
+        b_qzeros: torch.Tensor | None,
         n: torch.SymInt,
         group_size: torch.SymInt,
         sm_count: torch.SymInt,
@@ -657,7 +657,7 @@ if hasattr(torch.ops._C, "ggml_dequantize"):
         quant_type: int,
         m: torch.SymInt,
         n: torch.SymInt,
-        dtype: Optional[torch.dtype] = None,
+        dtype: torch.dtype | None = None,
     ) -> torch.Tensor:
         return torch.empty((m, n), dtype=torch.float16, device=W.device)
 
@@ -760,7 +760,7 @@ def cutlass_scaled_mm(
     scale_a: torch.Tensor,
     scale_b: torch.Tensor,
     out_dtype: torch.dtype,
-    bias: Optional[torch.Tensor] = None,
+    bias: torch.Tensor | None = None,
 ) -> torch.Tensor:
     """
     `cutlass_scaled_mm` implements a fused version of
@@ -812,8 +812,8 @@ def cutlass_scaled_mm_azp(
     scale_b: torch.Tensor,
     out_dtype: torch.dtype,
     azp_adj: torch.Tensor,
-    azp: Optional[torch.Tensor] = None,
-    bias: Optional[torch.Tensor] = None,
+    azp: torch.Tensor | None = None,
+    bias: torch.Tensor | None = None,
 ) -> torch.Tensor:
     """
     :param azp_adj: In the per-tensor case, this should include the azp.
@@ -890,7 +890,7 @@ def cutlass_scaled_sparse_mm(
     scale_a: torch.Tensor,
     scale_b: torch.Tensor,
     out_dtype: torch.dtype,
-    bias: Optional[torch.Tensor] = None,
+    bias: torch.Tensor | None = None,
 ) -> torch.Tensor:
     """
     Performs a scaled sparse matrix multiplication using Cutlass.
@@ -940,7 +940,7 @@ def get_cutlass_moe_mm_data(
     num_experts: int,
     n: int,
     k: int,
-    blockscale_offsets: Optional[torch.Tensor] = None,
+    blockscale_offsets: torch.Tensor | None = None,
 ):
     """
     Prepare data necessary to perform CUTLASS grouped matrix multiplications
@@ -986,7 +986,7 @@ def get_cutlass_moe_mm_problem_sizes(
     num_experts: int,
     n: int,
     k: int,
-    blockscale_offsets: Optional[torch.Tensor] = None,
+    blockscale_offsets: torch.Tensor | None = None,
 ):
     """
     Compute only the per-expert problem sizes needed by the two grouped matrix
@@ -1195,14 +1195,14 @@ def awq_marlin_moe_repack(
 
 def gptq_marlin_gemm(
     a: torch.Tensor,
-    c: Optional[torch.Tensor],
+    c: torch.Tensor | None,
     b_q_weight: torch.Tensor,
-    b_bias: Optional[torch.Tensor],
+    b_bias: torch.Tensor | None,
     b_scales: torch.Tensor,
-    global_scale: Optional[torch.Tensor],
-    b_zeros: Optional[torch.Tensor],
-    g_idx: Optional[torch.Tensor],
-    perm: Optional[torch.Tensor],
+    global_scale: torch.Tensor | None,
+    b_zeros: torch.Tensor | None,
+    g_idx: torch.Tensor | None,
+    perm: torch.Tensor | None,
     workspace: torch.Tensor,
     b_q_type: ScalarType,
     size_m: int,
@@ -1239,11 +1239,11 @@ def gptq_marlin_gemm(
 def machete_supported_schedules(
     a_type: torch.dtype,
     b_type: ScalarType,
-    group_scales_type: Optional[torch.dtype],
-    group_zeros_type: Optional[torch.dtype] = None,
-    channel_scales_type: Optional[torch.dtype] = None,
-    token_scales_type: Optional[torch.dtype] = None,
-    out_type: Optional[torch.dtype] = None,
+    group_scales_type: torch.dtype | None,
+    group_zeros_type: torch.dtype | None = None,
+    channel_scales_type: torch.dtype | None = None,
+    token_scales_type: torch.dtype | None = None,
+    out_type: torch.dtype | None = None,
 ) -> list[str]:
     return torch.ops._C.machete_supported_schedules(
         a_type,
@@ -1261,13 +1261,13 @@ def machete_mm(
     # b_q Should be the tensor returned by machete_prepack_B
     b_q: torch.Tensor,
     b_type: ScalarType,
-    out_type: Optional[torch.dtype] = None,
-    b_group_scales: Optional[torch.Tensor] = None,
-    b_group_zeros: Optional[torch.Tensor] = None,
-    b_group_size: Optional[int] = None,
-    b_channel_scales: Optional[torch.Tensor] = None,
-    a_token_scales: Optional[torch.Tensor] = None,
-    schedule: Optional[str] = None,
+    out_type: torch.dtype | None = None,
+    b_group_scales: torch.Tensor | None = None,
+    b_group_zeros: torch.Tensor | None = None,
+    b_group_size: int | None = None,
+    b_channel_scales: torch.Tensor | None = None,
+    a_token_scales: torch.Tensor | None = None,
+    schedule: str | None = None,
 ) -> torch.Tensor:
     return torch.ops._C.machete_mm(
         a,
@@ -1287,7 +1287,7 @@ def machete_prepack_B(
     b_q_weight: torch.Tensor,
     a_type: torch.dtype,
     b_type: ScalarType,
-    group_scales_type: Optional[torch.dtype],
+    group_scales_type: torch.dtype | None,
 ) -> torch.Tensor:
     return torch.ops._C.machete_prepack_B(
         b_q_weight, a_type, b_type.id, group_scales_type
@@ -1303,8 +1303,8 @@ def cutlass_w4a8_mm(
     b_group_size: int,
     b_channel_scales: torch.Tensor,
     a_token_scales: torch.Tensor,
-    out_type: Optional[torch.dtype] = None,
-    maybe_schedule: Optional[str] = None,
+    out_type: torch.dtype | None = None,
+    maybe_schedule: str | None = None,
 ) -> torch.Tensor:
     return torch.ops._C.cutlass_w4a8_mm(
         a,
@@ -1458,11 +1458,11 @@ def scaled_fp4_experts_quant(
 # fp8
 def scaled_fp8_quant(
     input: torch.Tensor,
-    scale: Optional[torch.Tensor] = None,
-    num_token_padding: Optional[int] = None,
-    scale_ub: Optional[torch.Tensor] = None,
+    scale: torch.Tensor | None = None,
+    num_token_padding: int | None = None,
+    scale_ub: torch.Tensor | None = None,
     use_per_token_if_dynamic: bool = False,
-    output: Optional[torch.Tensor] = None,
+    output: torch.Tensor | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Quantize input tensor to FP8 and return quantized tensor and scale.
@@ -1489,7 +1489,7 @@ def scaled_fp8_quant(
     """
     # This code assumes batch_dim and num_tokens are flattened
     assert input.ndim == 2
-    shape: Union[tuple[int, int], torch.Size] = input.shape
+    shape: tuple[int, int] | torch.Size = input.shape
     # For ROCm on MI300, the output fp8 dtype is torch.float_e3m3fnuz
     out_dtype: torch.dtype = current_platform.fp8_dtype()
     if num_token_padding:
@@ -1520,7 +1520,7 @@ def scaled_fp8_quant(
 def allspark_repack_weight(
     qweight: torch.Tensor,
     scale: torch.Tensor,
-    zero_point: Optional[torch.Tensor] = None,
+    zero_point: torch.Tensor | None = None,
     has_zp: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """
@@ -1576,7 +1576,7 @@ def allspark_w8a16_gemm(
     a: torch.Tensor,
     b_qweight: torch.Tensor,
     b_scales: torch.Tensor,
-    b_qzeros: Optional[torch.Tensor],
+    b_qzeros: torch.Tensor | None,
     n: int,
     group_size: int,
     sm_count: int,
@@ -1603,10 +1603,10 @@ def allspark_w8a16_gemm(
 # int8
 def scaled_int8_quant(
     input: torch.Tensor,
-    scale: Optional[torch.Tensor] = None,
-    azp: Optional[torch.Tensor] = None,
+    scale: torch.Tensor | None = None,
+    azp: torch.Tensor | None = None,
     symmetric: bool = True,
-) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
     """
     Quantize the input tensor to int8 and return the quantized tensor and scale, and maybe azp.
 
@@ -1643,7 +1643,7 @@ def scaled_int8_quant(
 
 # gguf
 def ggml_dequantize(
-    W: torch.Tensor, quant_type: int, m: int, n: int, dtype: Optional[torch.dtype]
+    W: torch.Tensor, quant_type: int, m: int, n: int, dtype: torch.dtype | None
 ) -> torch.Tensor:
     return torch.ops._C.ggml_dequantize(W, quant_type, m, n, dtype)
 
@@ -1713,13 +1713,13 @@ def selective_scan_fwd(
     A: torch.Tensor,
     B: torch.Tensor,
     C: torch.Tensor,
-    D_: Optional[torch.Tensor],
-    z_: Optional[torch.Tensor],
-    delta_bias_: Optional[torch.Tensor],
+    D_: torch.Tensor | None,
+    z_: torch.Tensor | None,
+    delta_bias_: torch.Tensor | None,
     delta_softplus: bool,
-    query_start_loc: Optional[torch.Tensor],
-    cache_indices: Optional[torch.Tensor],
-    has_initial_state: Optional[torch.Tensor],
+    query_start_loc: torch.Tensor | None,
+    cache_indices: torch.Tensor | None,
+    has_initial_state: torch.Tensor | None,
     ssm_states: torch.Tensor,
     pad_slot_id: int,
 ):
@@ -1794,8 +1794,8 @@ def moe_wna16_gemm(
     output: torch.Tensor,
     b_qweight: torch.Tensor,
     b_scales: torch.Tensor,
-    b_qzeros: Optional[torch.Tensor],
-    topk_weights: Optional[torch.Tensor],
+    b_qzeros: torch.Tensor | None,
+    topk_weights: torch.Tensor | None,
     sorted_token_ids: torch.Tensor,
     experts_ids: torch.Tensor,
     num_tokens_post_pad: torch.Tensor,
@@ -1864,14 +1864,14 @@ def grouped_topk(
 
 def moe_wna16_marlin_gemm(
     input: torch.Tensor,
-    output: Optional[torch.Tensor],
+    output: torch.Tensor | None,
     b_qweight: torch.Tensor,
-    b_bias: Optional[torch.Tensor],
+    b_bias: torch.Tensor | None,
     b_scales: torch.Tensor,
-    global_scale: Optional[torch.Tensor],
-    b_qzeros: Optional[torch.Tensor],
-    g_idx: Optional[torch.Tensor],
-    perm: Optional[torch.Tensor],
+    global_scale: torch.Tensor | None,
+    b_qzeros: torch.Tensor | None,
+    g_idx: torch.Tensor | None,
+    perm: torch.Tensor | None,
     workspace: torch.Tensor,
     sorted_token_ids: torch.Tensor,
     expert_ids: torch.Tensor,
@@ -1950,12 +1950,12 @@ if hasattr(torch.ops, "_moe_C") and hasattr(torch.ops._moe_C, "marlin_gemm_moe")
     @register_fake("_moe_C::moe_wna16_marlin_gemm")
     def moe_wna16_marlin_gemm_fake(
         input: torch.Tensor,
-        output: Optional[torch.Tensor],
+        output: torch.Tensor | None,
         b_qweight: torch.Tensor,
         b_scales: torch.Tensor,
-        b_qzeros: Optional[torch.Tensor],
-        g_idx: Optional[torch.Tensor],
-        perm: Optional[torch.Tensor],
+        b_qzeros: torch.Tensor | None,
+        g_idx: torch.Tensor | None,
+        perm: torch.Tensor | None,
         workspace: torch.Tensor,
         sorted_token_ids: torch.Tensor,
         expert_ids: torch.Tensor,
@@ -2068,7 +2068,7 @@ def gather_and_maybe_dequant_cache(
     batch_size: int,
     kv_cache_dtype: str,
     scale: torch.Tensor,
-    seq_starts: Optional[torch.Tensor] = None,
+    seq_starts: torch.Tensor | None = None,
 ) -> None:
     torch.ops._C_cache_ops.gather_and_maybe_dequant_cache(
         src_cache,
@@ -2088,7 +2088,7 @@ def cp_gather_cache(
     block_table: torch.Tensor,
     cu_seq_lens: torch.Tensor,
     batch_size: int,
-    seq_starts: Optional[torch.Tensor] = None,
+    seq_starts: torch.Tensor | None = None,
 ) -> None:
     torch.ops._C_cache_ops.cp_gather_cache(
         src_cache, dst, block_table, cu_seq_lens, batch_size, seq_starts
@@ -2187,9 +2187,7 @@ def free_shared_buffer(ptr: int) -> None:
 
 
 # quick all reduce
-def init_custom_qr(
-    rank: int, world_size: int, qr_max_size: Optional[int] = None
-) -> int:
+def init_custom_qr(rank: int, world_size: int, qr_max_size: int | None = None) -> int:
     return torch.ops._C_custom_ar.init_custom_qr(rank, world_size, qr_max_size)
 
 
@@ -2247,7 +2245,7 @@ def flash_mla_with_kvcache(
     head_dim_v: int,
     tile_scheduler_metadata: torch.Tensor,
     num_splits: torch.Tensor,
-    softmax_scale: Optional[float] = None,
+    softmax_scale: float | None = None,
     causal: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
@@ -2324,7 +2322,7 @@ if hasattr(torch.ops._C, "weight_packed_linear"):
     def weight_packed_linear_fake(
         mat1: torch.Tensor,
         mat2: torch.Tensor,
-        bias: Optional[torch.Tensor],
+        bias: torch.Tensor | None,
         is_vnni: bool,
     ) -> torch.Tensor:
         return torch.empty(
@@ -2344,11 +2342,11 @@ if hasattr(torch.ops._C, "fused_experts_cpu"):
         inplace: bool,
         use_int8_w8a8: bool,
         use_fp8_w8a16: bool,
-        w1_scale: Optional[torch.Tensor],
-        w2_scale: Optional[torch.Tensor],
-        block_size: Optional[list[int]],
-        a1_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        w1_scale: torch.Tensor | None,
+        w2_scale: torch.Tensor | None,
+        block_size: list[int] | None,
+        a1_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         is_vnni: bool,
     ) -> torch.Tensor:
         return torch.empty_like(hidden_states)
@@ -2361,7 +2359,7 @@ if hasattr(torch.ops._C, "int8_scaled_mm_with_quant"):
         mat1: torch.Tensor,
         mat2: torch.Tensor,
         scales2: torch.Tensor,
-        bias: Optional[torch.Tensor],
+        bias: torch.Tensor | None,
         out_dtype: torch.dtype,
         is_vnni: bool,
     ) -> torch.Tensor:
@@ -2372,7 +2370,7 @@ if hasattr(torch.ops._C, "int8_scaled_mm_with_quant"):
 
 class CPUDNNLGEMMHandler:
     def __init__(self) -> None:
-        self.handler: Optional[int] = None
+        self.handler: int | None = None
         self.n = -1
         self.k = -1
 
@@ -2403,7 +2401,7 @@ def create_onednn_mm(
 def onednn_mm(
     dnnl_handler: CPUDNNLGEMMHandler,
     x: torch.Tensor,
-    bias: Optional[torch.Tensor],
+    bias: torch.Tensor | None,
 ) -> torch.Tensor:
     output = torch.empty((*x.shape[0:-1], dnnl_handler.n), dtype=x.dtype)
     torch.ops._C.onednn_mm(
@@ -2431,8 +2429,8 @@ def create_onednn_scaled_mm(
 
 def onednn_scaled_int8_quant(
     input: torch.Tensor,
-    scale: Optional[torch.Tensor] = None,
-    azp: Optional[torch.Tensor] = None,
+    scale: torch.Tensor | None = None,
+    azp: torch.Tensor | None = None,
     symmetric: bool = True,
 ):
     """
@@ -2471,10 +2469,10 @@ def onednn_scaled_mm(
     dnnl_handler: CPUDNNLGEMMHandler,
     x: torch.Tensor,
     output: torch.Tensor,
-    input_scale: Optional[torch.Tensor],
-    input_zp: Optional[torch.Tensor],
-    input_zp_adj: Optional[torch.Tensor],
-    bias: Optional[torch.Tensor],
+    input_scale: torch.Tensor | None,
+    input_zp: torch.Tensor | None,
+    input_zp_adj: torch.Tensor | None,
+    bias: torch.Tensor | None,
 ) -> torch.Tensor:
     torch.ops._C.onednn_scaled_mm(
         output, x, input_scale, input_zp, input_zp_adj, bias, dnnl_handler.handler
diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py
index 1f458f940a289..e773e1d13f0b8 100644
--- a/vllm/_ipex_ops.py
+++ b/vllm/_ipex_ops.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
 
 import torch
 
@@ -65,7 +64,7 @@ class ipex_ops:
         context_lens: torch.Tensor,
         block_size: int,
         max_context_len: int,
-        alibi_slopes: Optional[torch.Tensor],
+        alibi_slopes: torch.Tensor | None,
         kv_cache_dtype: str,
         k_scale: float,
         v_scale: float,
@@ -107,7 +106,7 @@ class ipex_ops:
         context_lens: torch.Tensor,
         block_size: int,
         max_context_len: int,
-        alibi_slopes: Optional[torch.Tensor],
+        alibi_slopes: torch.Tensor | None,
         kv_cache_dtype: str,
         k_scale: float,
         v_scale: float,
@@ -174,7 +173,7 @@ class ipex_ops:
         out: torch.Tensor,
         seqlen_q: torch.Tensor,
         seqlen_k: torch.Tensor,
-        alibi_slopes: Optional[torch.Tensor],
+        alibi_slopes: torch.Tensor | None,
         max_seqlen_q: int,
         max_seqlen_k: int,
         pdropout: float,
@@ -254,8 +253,8 @@ class ipex_ops:
         value_cache: torch.Tensor,
         slot_mapping: torch.Tensor,
         kv_cache_dtype: str,
-        k_scale: Optional[torch.Tensor] = None,
-        v_scale: Optional[torch.Tensor] = None,
+        k_scale: torch.Tensor | None = None,
+        v_scale: torch.Tensor | None = None,
         k_scale_float: float = 1.0,
         v_scale_float: float = 1.0,
     ) -> None:
@@ -283,10 +282,10 @@ class ipex_ops:
         softmax_scale: float,
         causal: bool,
         block_table: torch.Tensor,
-        alibi_slopes: Optional[torch.Tensor],
-        window_size: Optional[list[int]] = None,
-        softcap: Optional[float] = 0.0,
-        cu_seqlens_k: Optional[torch.Tensor] = None,
+        alibi_slopes: torch.Tensor | None,
+        window_size: list[int] | None = None,
+        softcap: float | None = 0.0,
+        cu_seqlens_k: torch.Tensor | None = None,
         # The following parameters are not used in ipex kernel currently,
         # we keep API compatible to CUDA's.
         scheduler_metadata=None,
@@ -295,7 +294,7 @@ class ipex_ops:
         k_descale=None,
         v_descale=None,
         num_splits=0,
-        s_aux: Optional[torch.Tensor] = None,
+        s_aux: torch.Tensor | None = None,
     ):
         if cu_seqlens_k is None:
             # cu_seqlens_k is not used in ipex kernel.
@@ -344,10 +343,10 @@ class ipex_ops:
         cache_seqlens: torch.Tensor,
         qkv_dtype=torch.bfloat16,
         headdim_v=None,
-        cu_seqlens_q: Optional[torch.Tensor] = None,
-        cu_seqlens_k_new: Optional[torch.Tensor] = None,
-        cache_leftpad: Optional[torch.Tensor] = None,
-        page_size: Optional[int] = None,
+        cu_seqlens_q: torch.Tensor | None = None,
+        cu_seqlens_k_new: torch.Tensor | None = None,
+        cache_leftpad: torch.Tensor | None = None,
+        page_size: int | None = None,
         max_seqlen_k_new=0,
         causal=False,
         window_size=(-1, -1),  # -1 means infinite context window
@@ -382,11 +381,11 @@ class ipex_ops:
     @staticmethod
     def scaled_fp8_quant(
         input: torch.Tensor,
-        scale: Optional[torch.Tensor] = None,
-        num_token_padding: Optional[int] = None,
-        scale_ub: Optional[torch.Tensor] = None,
+        scale: torch.Tensor | None = None,
+        num_token_padding: int | None = None,
+        scale_ub: torch.Tensor | None = None,
         use_per_token_if_dynamic: bool = False,
-        output: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Quantize input tensor to FP8 and return quantized tensor and scale.
@@ -414,7 +413,7 @@ class ipex_ops:
         """
         # This code assumes batch_dim and num_tokens are flattened
         assert input.ndim == 2
-        shape: Union[tuple[int, int], torch.Size] = input.shape
+        shape: tuple[int, int] | torch.Size = input.shape
         out_dtype: torch.dtype = current_platform.fp8_dtype()
         if num_token_padding:
             shape = (max(num_token_padding, input.shape[0]), shape[1])
diff --git a/vllm/assets/base.py b/vllm/assets/base.py
index 409bfc18ff8cf..abf397e1cc1ce 100644
--- a/vllm/assets/base.py
+++ b/vllm/assets/base.py
@@ -3,7 +3,6 @@
 
 from functools import lru_cache
 from pathlib import Path
-from typing import Optional
 
 import vllm.envs as envs
 from vllm.connections import global_http_connection
@@ -20,7 +19,7 @@ def get_cache_dir() -> Path:
 
 
 @lru_cache
-def get_vllm_public_assets(filename: str, s3_prefix: Optional[str] = None) -> Path:
+def get_vllm_public_assets(filename: str, s3_prefix: str | None = None) -> Path:
     """
     Download an asset file from ``s3://vllm-public-assets``
     and return the path to the downloaded file.
diff --git a/vllm/assets/video.py b/vllm/assets/video.py
index 6b2ca8f867e03..a4e67ca0b63e3 100644
--- a/vllm/assets/video.py
+++ b/vllm/assets/video.py
@@ -3,7 +3,7 @@
 
 from dataclasses import dataclass
 from functools import lru_cache
-from typing import Any, ClassVar, Literal, Optional
+from typing import Any, ClassVar, Literal
 
 import cv2
 import numpy as np
@@ -137,7 +137,7 @@ class VideoAsset:
         ret = video_get_metadata(self.video_path, self.num_frames)
         return ret
 
-    def get_audio(self, sampling_rate: Optional[float] = None) -> npt.NDArray:
+    def get_audio(self, sampling_rate: float | None = None) -> npt.NDArray:
         """
         Read audio data from the video asset, used in Qwen2.5-Omni examples.
 
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index 3f23d4ef7d2c1..421b0c4beb370 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
-from typing import Generic, Optional, Protocol, TypeVar, Union
+from typing import Generic, Protocol, TypeVar
 
 import torch
 
@@ -65,7 +65,7 @@ class AttentionBackend(ABC):
         raise NotImplementedError
 
     @classmethod
-    def get_supported_kernel_block_size(cls) -> list[Union[int, MultipleOf]]:
+    def get_supported_kernel_block_size(cls) -> list[int | MultipleOf]:
         return cls.get_impl_cls().get_supported_kernel_block_size()
 
     @classmethod
@@ -158,18 +158,18 @@ class AttentionImpl(ABC, Generic[T]):
         num_heads: int,
         head_size: int,
         scale: float,
-        num_kv_heads: Optional[int] = None,
-        alibi_slopes: Optional[list[float]] = None,
-        sliding_window: Optional[int] = None,
+        num_kv_heads: int | None = None,
+        alibi_slopes: list[float] | None = None,
+        sliding_window: int | None = None,
         kv_cache_dtype: str = "auto",
-        logits_soft_cap: Optional[float] = None,
+        logits_soft_cap: float | None = None,
         attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
+        kv_sharing_target_layer_name: str | None = None,
     ) -> None:
         raise NotImplementedError
 
     @staticmethod
-    def get_supported_kernel_block_size() -> list[Union[int, MultipleOf]]:
+    def get_supported_kernel_block_size() -> list[int | MultipleOf]:
         # TODO: implement this function for all backends.
         return [MultipleOf(1)]
 
@@ -182,9 +182,9 @@ class AttentionImpl(ABC, Generic[T]):
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: T,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         raise NotImplementedError
 
@@ -208,21 +208,21 @@ class MLAAttentionImpl(AttentionImpl[T], Generic[T]):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float],
+        logits_soft_cap: float | None,
         attn_type: str,
-        kv_sharing_target_layer_name: Optional[str],
+        kv_sharing_target_layer_name: str | None,
         # MLA Specific Arguments
-        q_lora_rank: Optional[int],
+        q_lora_rank: int | None,
         kv_lora_rank: int,
         qk_nope_head_dim: int,
         qk_rope_head_dim: int,
         qk_head_dim: int,
         v_head_dim: int,
         kv_b_proj: ColumnParallelLinear,
-        indexer: Optional[object] = None,
+        indexer: object | None = None,
     ) -> None:
         raise NotImplementedError
 
@@ -235,9 +235,9 @@ class MLAAttentionImpl(AttentionImpl[T], Generic[T]):
         k_pe: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: T,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         raise NotImplementedError
 
diff --git a/vllm/attention/backends/registry.py b/vllm/attention/backends/registry.py
index 313f941ebf934..dc6de483d6ae2 100644
--- a/vllm/attention/backends/registry.py
+++ b/vllm/attention/backends/registry.py
@@ -3,7 +3,6 @@
 """Attention backend registry"""
 
 import enum
-from typing import Optional
 
 from vllm.utils import resolve_obj_by_qualname
 
@@ -53,7 +52,7 @@ BACKEND_MAP = {
 }
 
 
-def register_attn_backend(backend: _Backend, class_path: Optional[str] = None):
+def register_attn_backend(backend: _Backend, class_path: str | None = None):
     """
     Decorator: register a custom attention backend into BACKEND_MAPPING.
     - If class_path is provided, use it.
@@ -98,7 +97,7 @@ def backend_to_class(backend: _Backend) -> type:
     return resolve_obj_by_qualname(backend_class_name)
 
 
-def backend_name_to_enum(backend_name: str) -> Optional[_Backend]:
+def backend_name_to_enum(backend_name: str) -> _Backend | None:
     """
     Convert a string backend name to a _Backend enum value.
 
diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
index 46a87bdd1f7e1..4c7fa477b52ba 100644
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -3,7 +3,6 @@
 """Attention backend utils"""
 
 from dataclasses import dataclass
-from typing import Optional
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
@@ -15,7 +14,7 @@ PAD_SLOT_ID = -1
 
 @dataclass
 class MLADims:
-    q_lora_rank: Optional[int]
+    q_lora_rank: int | None
     kv_lora_rank: int
     qk_nope_head_dim: int
     qk_rope_head_dim: int
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 9f43cb31218f7..929c3b6a4906b 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -2,7 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer."""
 
-from typing import Callable, Optional, cast
+from collections.abc import Callable
+from typing import cast
 
 import torch
 import torch.nn as nn
@@ -128,16 +129,16 @@ class Attention(nn.Module, AttentionLayerBase):
         num_heads: int,
         head_size: int,
         scale: float,
-        num_kv_heads: Optional[int] = None,
-        alibi_slopes: Optional[list[float]] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        logits_soft_cap: Optional[float] = None,
-        per_layer_sliding_window: Optional[int] = None,
+        num_kv_heads: int | None = None,
+        alibi_slopes: list[float] | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        logits_soft_cap: float | None = None,
+        per_layer_sliding_window: int | None = None,
         prefix: str = "",
         attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
-        attn_backend: Optional[type[AttentionBackend]] = None,
+        kv_sharing_target_layer_name: str | None = None,
+        attn_backend: type[AttentionBackend] | None = None,
         **extra_impl_args,
     ) -> None:
         """
@@ -191,7 +192,7 @@ class Attention(nn.Module, AttentionLayerBase):
 
         # The output scale on host memory. This should be the input scale of
         # the quant op after this attention layer.
-        self._o_scale_float: Optional[float] = None
+        self._o_scale_float: float | None = None
 
         self.num_heads = num_heads
         self.head_size = head_size
@@ -319,7 +320,7 @@ class Attention(nn.Module, AttentionLayerBase):
         # For some alternate attention backends like MLA the attention output
         # shape does not match the query shape, so we optionally let the model
         # definition specify the output tensor shape.
-        output_shape: Optional[torch.Size] = None,
+        output_shape: torch.Size | None = None,
     ) -> torch.Tensor:
         """
         The KV cache is stored inside this class and is accessed via
@@ -427,7 +428,7 @@ class MultiHeadAttention(nn.Module):
         num_heads: int,
         head_size: int,
         scale: float,
-        num_kv_heads: Optional[int] = None,
+        num_kv_heads: int | None = None,
         # This has no effect, it is only here to make it easier to swap
         # between Attention and MultiHeadAttention
         prefix: str = "",
@@ -582,14 +583,14 @@ class MLAAttention(nn.Module, AttentionLayerBase):
         qk_nope_head_dim: int,
         qk_rope_head_dim: int,
         v_head_dim: int,
-        q_lora_rank: Optional[int],
+        q_lora_rank: int | None,
         kv_lora_rank: int,
         kv_b_proj: ColumnParallelLinear,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_sparse: bool = False,
-        indexer: Optional[object] = None,
+        indexer: object | None = None,
     ):
         super().__init__()
         self.num_heads = num_heads
@@ -670,7 +671,7 @@ class MLAAttention(nn.Module, AttentionLayerBase):
         self._q_scale_float = 1.0
         self._k_scale_float = 1.0
         self._v_scale_float = 1.0
-        self._o_scale_float: Optional[float] = None
+        self._o_scale_float: float | None = None
 
         self.use_sparse = use_sparse
 
@@ -688,7 +689,7 @@ class MLAAttention(nn.Module, AttentionLayerBase):
         q: torch.Tensor,
         kv_c_normed: torch.Tensor,
         k_pe: torch.Tensor,
-        output_shape: Optional[torch.Size] = None,
+        output_shape: torch.Size | None = None,
     ) -> torch.Tensor:
         if self.use_direct_call:
             forward_context: ForwardContext = get_forward_context()
@@ -888,8 +889,8 @@ def unified_attention_with_output(
     value: torch.Tensor,
     output: torch.Tensor,
     layer_name: str,
-    output_scale: Optional[torch.Tensor] = None,
-    output_block_scale: Optional[torch.Tensor] = None,
+    output_scale: torch.Tensor | None = None,
+    output_block_scale: torch.Tensor | None = None,
 ) -> None:
     wait_for_kv_layer_from_connector(layer_name)
     forward_context: ForwardContext = get_forward_context()
@@ -919,8 +920,8 @@ def unified_attention_with_output_fake(
     value: torch.Tensor,
     output: torch.Tensor,
     layer_name: str,
-    output_scale: Optional[torch.Tensor] = None,
-    output_block_scale: Optional[torch.Tensor] = None,
+    output_scale: torch.Tensor | None = None,
+    output_block_scale: torch.Tensor | None = None,
 ) -> None:
     return
 
@@ -978,8 +979,8 @@ def unified_mla_attention_with_output(
     k_pe: torch.Tensor,
     output: torch.Tensor,
     layer_name: str,
-    output_scale: Optional[torch.Tensor] = None,
-    output_block_scale: Optional[torch.Tensor] = None,
+    output_scale: torch.Tensor | None = None,
+    output_block_scale: torch.Tensor | None = None,
 ) -> None:
     wait_for_kv_layer_from_connector(layer_name)
     forward_context: ForwardContext = get_forward_context()
@@ -1009,8 +1010,8 @@ def unified_mla_attention_with_output_fake(
     k_pe: torch.Tensor,
     output: torch.Tensor,
     layer_name: str,
-    output_scale: Optional[torch.Tensor] = None,
-    output_block_scale: Optional[torch.Tensor] = None,
+    output_scale: torch.Tensor | None = None,
+    output_block_scale: torch.Tensor | None = None,
 ) -> None:
     return
 
diff --git a/vllm/attention/layers/chunked_local_attention.py b/vllm/attention/layers/chunked_local_attention.py
index 3d37e901605f9..d1f9a0437aa64 100644
--- a/vllm/attention/layers/chunked_local_attention.py
+++ b/vllm/attention/layers/chunked_local_attention.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import functools
-from typing import ClassVar, Optional
+from typing import ClassVar
 
 import torch
 
@@ -60,11 +60,11 @@ class ChunkedLocalAttention(Attention):
         head_size: int,
         scale: float,
         attention_chunk_size: int,
-        num_kv_heads: Optional[int] = None,
-        alibi_slopes: Optional[list[float]] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        kv_sharing_target_layer_name: Optional[str] = None,
+        num_kv_heads: int | None = None,
+        alibi_slopes: list[float] | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        kv_sharing_target_layer_name: str | None = None,
         prefix: str = "",
     ):
         dtype = torch.get_default_dtype()
diff --git a/vllm/attention/layers/cross_attention.py b/vllm/attention/layers/cross_attention.py
index fb7004f86538f..b07ffcc5ffeba 100644
--- a/vllm/attention/layers/cross_attention.py
+++ b/vllm/attention/layers/cross_attention.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import functools
 from copy import copy
-from typing import Optional
 
 import numpy as np
 import torch
@@ -138,8 +137,8 @@ class CrossAttention(Attention):
         num_heads: int,
         head_size: int,
         scale: float,
-        cache_config: Optional[CacheConfig] = None,
-        attn_type: Optional[str] = None,
+        cache_config: CacheConfig | None = None,
+        attn_type: str | None = None,
         **kwargs,
     ):
         dtype = torch.get_default_dtype()
diff --git a/vllm/attention/layers/encoder_only_attention.py b/vllm/attention/layers/encoder_only_attention.py
index f49f195563dca..1a47135d03a78 100644
--- a/vllm/attention/layers/encoder_only_attention.py
+++ b/vllm/attention/layers/encoder_only_attention.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import functools
 from copy import copy
-from typing import Optional
 
 import torch
 
@@ -60,8 +59,8 @@ class EncoderOnlyAttention(Attention):
         num_heads: int,
         head_size: int,
         scale: float,
-        cache_config: Optional[CacheConfig] = None,
-        attn_type: Optional[str] = None,
+        cache_config: CacheConfig | None = None,
+        attn_type: str | None = None,
         **kwargs,
     ):
         dtype = torch.get_default_dtype()
diff --git a/vllm/attention/ops/flashmla.py b/vllm/attention/ops/flashmla.py
index 0bf354a95b1ca..2de7f71b6e306 100644
--- a/vllm/attention/ops/flashmla.py
+++ b/vllm/attention/ops/flashmla.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # adapted from: https://github.com/deepseek-ai/FlashMLA/blob/main/flash_mla/flash_mla_interface.py
-from typing import Optional
 
 import torch
 
@@ -31,7 +30,7 @@ else:
     _flashmla_extension_C_AVAILABLE = False
 
 
-def _is_flashmla_available() -> tuple[bool, Optional[str]]:
+def _is_flashmla_available() -> tuple[bool, str | None]:
     if not _flashmla_C_AVAILABLE:
         return (
             False,
@@ -49,7 +48,7 @@ def _is_flashmla_available() -> tuple[bool, Optional[str]]:
     return True, None
 
 
-def is_flashmla_dense_supported() -> tuple[bool, Optional[str]]:
+def is_flashmla_dense_supported() -> tuple[bool, str | None]:
     """
     Return: is_supported_flag, unsupported_reason (optional).
     """
@@ -61,7 +60,7 @@ def is_flashmla_dense_supported() -> tuple[bool, Optional[str]]:
     return True, None
 
 
-def is_flashmla_sparse_supported() -> tuple[bool, Optional[str]]:
+def is_flashmla_sparse_supported() -> tuple[bool, str | None]:
     """
     Return: is_supported_flag, unsupported_reason (optional).
     """
@@ -80,9 +79,9 @@ def get_mla_metadata(
     cache_seqlens: torch.Tensor,
     num_q_tokens_per_head_k: int,
     num_heads_k: int,
-    num_heads_q: Optional[int] = None,
+    num_heads_q: int | None = None,
     is_fp8_kvcache: bool = False,
-    topk: Optional[int] = None,
+    topk: int | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Arguments:
@@ -121,12 +120,12 @@ def flash_mla_with_kvcache(
     head_dim_v: int,
     tile_scheduler_metadata: torch.Tensor,
     num_splits: torch.Tensor,
-    softmax_scale: Optional[float] = None,
+    softmax_scale: float | None = None,
     causal: bool = False,
-    descale_q: Optional[torch.Tensor] = None,
-    descale_k: Optional[torch.Tensor] = None,
+    descale_q: torch.Tensor | None = None,
+    descale_k: torch.Tensor | None = None,
     is_fp8_kvcache: bool = False,
-    indices: Optional[torch.Tensor] = None,
+    indices: torch.Tensor | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Arguments:
diff --git a/vllm/attention/ops/merge_attn_states.py b/vllm/attention/ops/merge_attn_states.py
index 79800eb40766c..16106f3c93a6a 100644
--- a/vllm/attention/ops/merge_attn_states.py
+++ b/vllm/attention/ops/merge_attn_states.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
@@ -13,7 +12,7 @@ def merge_attn_states(
     prefix_lse: torch.Tensor,
     suffix_output: torch.Tensor,
     suffix_lse: torch.Tensor,
-    output_lse: Optional[torch.Tensor] = None,
+    output_lse: torch.Tensor | None = None,
 ) -> None:
     # NOTE(DefTruth): Currently, custom merge_attn_states CUDA kernel
     # is not support for FP8 dtype, fallback to use Triton kernel.
diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py
index 4db7d1a3a3258..8e010ffba32ec 100644
--- a/vllm/attention/ops/paged_attn.py
+++ b/vllm/attention/ops/paged_attn.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 
@@ -27,7 +26,7 @@ class PagedAttentionMetadata:
 
     # (batch_size,). The length of sequences (entire tokens seen so far) per
     # sequence.
-    seq_lens_tensor: Optional[torch.Tensor]
+    seq_lens_tensor: torch.Tensor | None
     # Maximum sequence length in the batch. 0 if it is prefill-only batch.
     max_decode_seq_len: int
     # (batch_size, max_blocks_per_seq).
@@ -36,7 +35,7 @@ class PagedAttentionMetadata:
     # in the kv cache. Each block can contain up to block_size tokens.
     # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph
     # captured.
-    block_tables: Optional[torch.Tensor]
+    block_tables: torch.Tensor | None
 
 
 class PagedAttention:
@@ -102,7 +101,7 @@ class PagedAttention:
         kv_cache_dtype: str,
         num_kv_heads: int,
         scale: float,
-        alibi_slopes: Optional[torch.Tensor],
+        alibi_slopes: torch.Tensor | None,
         k_scale: torch.Tensor,
         v_scale: torch.Tensor,
         tp_rank: int = 0,
@@ -211,8 +210,8 @@ class PagedAttention:
         query_start_loc: torch.Tensor,
         seq_lens_tensor: torch.Tensor,
         max_query_len: int,
-        alibi_slopes: Optional[torch.Tensor],
-        sliding_window: Optional[int],
+        alibi_slopes: torch.Tensor | None,
+        sliding_window: int | None,
         k_scale: torch.Tensor,
         v_scale: torch.Tensor,
     ) -> torch.Tensor:
diff --git a/vllm/attention/ops/rocm_aiter_mla.py b/vllm/attention/ops/rocm_aiter_mla.py
index c358b5971f865..8fc034dd721b2 100644
--- a/vllm/attention/ops/rocm_aiter_mla.py
+++ b/vllm/attention/ops/rocm_aiter_mla.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -30,9 +29,9 @@ def aiter_mla_decode_fwd(
     sm_scale: float,
     qo_indptr: torch.Tensor,
     max_seqlen_qo: int,
-    kv_indptr: Optional[torch.Tensor] = None,
-    kv_indices: Optional[torch.Tensor] = None,
-    kv_last_page_lens: Optional[torch.Tensor] = None,
+    kv_indptr: torch.Tensor | None = None,
+    kv_indices: torch.Tensor | None = None,
+    kv_last_page_lens: torch.Tensor | None = None,
     logit_cap: float = 0.0,
 ):
     torch.ops.vllm.rocm_aiter_mla_decode_fwd(
@@ -55,9 +54,9 @@ def mla_decode_fwd_impl(
     o: torch.Tensor,
     qo_indptr: torch.Tensor,
     max_seqlen_qo: int,
-    kv_indptr: Optional[torch.Tensor] = None,
-    kv_indices: Optional[torch.Tensor] = None,
-    kv_last_page_lens: Optional[torch.Tensor] = None,
+    kv_indptr: torch.Tensor | None = None,
+    kv_indices: torch.Tensor | None = None,
+    kv_last_page_lens: torch.Tensor | None = None,
     sm_scale: float = 1.0,
     logit_cap: float = 0.0,
 ) -> None:
@@ -83,9 +82,9 @@ def mla_decode_fwd_fake(
     o: torch.Tensor,
     qo_indptr: torch.Tensor,
     max_seqlen_qo: int,
-    kv_indptr: Optional[torch.Tensor] = None,
-    kv_indices: Optional[torch.Tensor] = None,
-    kv_last_page_lens: Optional[torch.Tensor] = None,
+    kv_indptr: torch.Tensor | None = None,
+    kv_indices: torch.Tensor | None = None,
+    kv_last_page_lens: torch.Tensor | None = None,
     sm_scale: float = 1.0,
     logit_cap: float = 0.0,
 ) -> None:
diff --git a/vllm/attention/ops/rocm_aiter_paged_attn.py b/vllm/attention/ops/rocm_aiter_paged_attn.py
index 069cfcaf00aaf..5c1ce68dde1b9 100644
--- a/vllm/attention/ops/rocm_aiter_paged_attn.py
+++ b/vllm/attention/ops/rocm_aiter_paged_attn.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import aiter as rocm_aiter
 import torch
@@ -62,7 +61,7 @@ class AITERPagedAttention(PagedAttention):
         kv_cache_dtype: str,
         num_kv_heads: int,
         scale: float,
-        alibi_slopes: Optional[torch.Tensor],
+        alibi_slopes: torch.Tensor | None,
         k_scale: torch.Tensor,
         v_scale: torch.Tensor,
         tp_rank: int = 0,
diff --git a/vllm/attention/ops/triton_merge_attn_states.py b/vllm/attention/ops/triton_merge_attn_states.py
index d29f92f8cecb2..3c87a24afd9c7 100644
--- a/vllm/attention/ops/triton_merge_attn_states.py
+++ b/vllm/attention/ops/triton_merge_attn_states.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
@@ -15,7 +14,7 @@ def merge_attn_states(
     prefix_lse: torch.Tensor,
     suffix_output: torch.Tensor,
     suffix_lse: torch.Tensor,
-    output_lse: Optional[torch.Tensor] = None,
+    output_lse: torch.Tensor | None = None,
 ) -> None:
     num_tokens = output.shape[0]
     num_query_heads = output.shape[1]
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 7dfe6ffda6a80..1872741339043 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -6,7 +6,6 @@ from collections.abc import Generator
 from contextlib import contextmanager
 from dataclasses import dataclass
 from functools import cache
-from typing import Optional, Union
 
 import torch
 
@@ -19,7 +18,7 @@ from vllm.utils import STR_BACKEND_ENV_VAR, resolve_obj_by_qualname
 logger = init_logger(__name__)
 
 
-def get_env_variable_attn_backend() -> Optional[_Backend]:
+def get_env_variable_attn_backend() -> _Backend | None:
     """
     Get the backend override specified by the vLLM attention
     backend environment variable, if one is specified.
@@ -40,10 +39,10 @@ def get_env_variable_attn_backend() -> Optional[_Backend]:
 #
 # THIS SELECTION TAKES PRECEDENCE OVER THE
 # VLLM_ATTENTION_BACKEND ENVIRONMENT VARIABLE
-forced_attn_backend: Optional[_Backend] = None
+forced_attn_backend: _Backend | None = None
 
 
-def global_force_attn_backend(attn_backend: Optional[_Backend]) -> None:
+def global_force_attn_backend(attn_backend: _Backend | None) -> None:
     """
     Force all attention operations to use a specified backend.
 
@@ -58,7 +57,7 @@ def global_force_attn_backend(attn_backend: Optional[_Backend]) -> None:
     forced_attn_backend = attn_backend
 
 
-def get_global_forced_attn_backend() -> Optional[_Backend]:
+def get_global_forced_attn_backend() -> _Backend | None:
     """
     Get the currently-forced choice of attention backend,
     or None if auto-selection is currently enabled.
@@ -77,7 +76,7 @@ class _IsSupported:
 
 
 def is_attn_backend_supported(
-    attn_backend: Union[str, type[AttentionBackend]],
+    attn_backend: str | type[AttentionBackend],
     head_size: int,
     dtype: torch.dtype,
     *,
@@ -127,7 +126,7 @@ def is_attn_backend_supported(
 def get_attn_backend(
     head_size: int,
     dtype: torch.dtype,
-    kv_cache_dtype: Optional[str],
+    kv_cache_dtype: str | None,
     block_size: int,
     use_mla: bool = False,
     has_sink: bool = False,
@@ -154,7 +153,7 @@ def get_attn_backend(
 def _cached_get_attn_backend(
     head_size: int,
     dtype: torch.dtype,
-    kv_cache_dtype: Optional[str],
+    kv_cache_dtype: str | None,
     block_size: int,
     use_v1: bool = False,
     use_mla: bool = False,
@@ -167,12 +166,12 @@ def _cached_get_attn_backend(
     # THIS SELECTION OVERRIDES THE VLLM_ATTENTION_BACKEND
     # ENVIRONMENT VARIABLE.
     selected_backend = None
-    backend_by_global_setting: Optional[_Backend] = get_global_forced_attn_backend()
+    backend_by_global_setting: _Backend | None = get_global_forced_attn_backend()
     if backend_by_global_setting is not None:
         selected_backend = backend_by_global_setting
     else:
         # Check the environment variable and override if specified
-        backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND
+        backend_by_env_var: str | None = envs.VLLM_ATTENTION_BACKEND
         if backend_by_env_var is not None:
             if backend_by_env_var.endswith("_VLLM_V1"):
                 logger.warning(
diff --git a/vllm/attention/utils/fa_utils.py b/vllm/attention/utils/fa_utils.py
index e13afd46ee96b..b92b822c1d19f 100644
--- a/vllm/attention/utils/fa_utils.py
+++ b/vllm/attention/utils/fa_utils.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 from vllm import envs
 from vllm.logger import init_logger
@@ -21,7 +20,7 @@ elif current_platform.is_xpu():
     get_scheduler_metadata = ops.get_scheduler_metadata
 
 
-def get_flash_attn_version(requires_alibi: bool = False) -> Optional[int]:
+def get_flash_attn_version(requires_alibi: bool = False) -> int | None:
     # import here to avoid circular dependencies
     from vllm.platforms import current_platform
 
diff --git a/vllm/beam_search.py b/vllm/beam_search.py
index e0ba863b9210e..fcd2d1f0e01ab 100644
--- a/vllm/beam_search.py
+++ b/vllm/beam_search.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional
 
 from vllm.logprobs import Logprob
 from vllm.lora.request import LoRARequest
@@ -22,13 +22,13 @@ class BeamSearchSequence:
     # The tokens include the prompt.
     tokens: list[int]
     logprobs: list[dict[int, Logprob]]
-    lora_request: Optional[LoRARequest] = None
+    lora_request: LoRARequest | None = None
     cum_logprob: float = 0.0
-    text: Optional[str] = None
-    finish_reason: Optional[str] = None
-    stop_reason: Union[int, str, None] = None
+    text: str | None = None
+    finish_reason: str | None = None
+    stop_reason: int | str | None = None
     multi_modal_data: Optional["MultiModalDataDict"] = None
-    mm_processor_kwargs: Optional[dict[str, Any]] = None
+    mm_processor_kwargs: dict[str, Any] | None = None
 
 
 @dataclass
@@ -45,8 +45,8 @@ class BeamSearchInstance:
     def __init__(
         self,
         prompt_tokens: list[int],
-        lora_request: Optional[LoRARequest] = None,
-        logprobs: Optional[list[dict[int, Logprob]]] = None,
+        lora_request: LoRARequest | None = None,
+        logprobs: list[dict[int, Logprob]] | None = None,
         **kwargs,
     ):
         self.beams: list[BeamSearchSequence] = [
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 8e71a7bfb1293..331d31c1d0e63 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -21,13 +21,13 @@ import logging
 import math
 import random
 from abc import ABC, abstractmethod
-from collections.abc import Iterator, Mapping
+from collections.abc import Callable, Iterator, Mapping
 from contextlib import suppress
 from copy import deepcopy
 from dataclasses import dataclass
 from functools import cache
 from io import BytesIO
-from typing import Any, Callable, Optional, Union, cast
+from typing import Any, cast
 
 import numpy as np
 from PIL import Image
@@ -75,12 +75,12 @@ class SampleRequest:
     Represents a single inference request for benchmarking.
     """
 
-    prompt: Union[str, list[str]]
+    prompt: str | list[str]
     prompt_len: int
     expected_output_len: int
-    multi_modal_data: Optional[Union[MultiModalDataDict, dict, list[dict]]] = None
-    lora_request: Optional[LoRARequest] = None
-    request_id: Optional[str] = None
+    multi_modal_data: MultiModalDataDict | dict | list[dict] | None = None
+    lora_request: LoRARequest | None = None
+    request_id: str | None = None
 
 
 # -----------------------------------------------------------------------------
@@ -94,7 +94,7 @@ class BenchmarkDataset(ABC):
 
     def __init__(
         self,
-        dataset_path: Optional[str] = None,
+        dataset_path: str | None = None,
         random_seed: int = DEFAULT_SEED,
         disable_shuffle: bool = False,
         **kwargs,
@@ -119,7 +119,7 @@ class BenchmarkDataset(ABC):
     def apply_multimodal_chat_transformation(
         self,
         prompt: str,
-        mm_content: Optional[Union[MultiModalDataDict, dict, list[dict]]] = None,
+        mm_content: MultiModalDataDict | dict | list[dict] | None = None,
     ) -> list[dict]:
         """
         Transform a prompt and optional multimodal content into a chat format.
@@ -154,9 +154,9 @@ class BenchmarkDataset(ABC):
 
     def get_random_lora_request(
         self,
-        max_loras: Optional[int] = None,
-        lora_path: Optional[str] = None,
-    ) -> Optional[LoRARequest]:
+        max_loras: int | None = None,
+        lora_path: str | None = None,
+    ) -> LoRARequest | None:
         """
         Optionally select a random LoRA request.
 
@@ -384,7 +384,7 @@ def gen_prompt_decode_to_target_len(
     target_token_len: int,
     max_retry: int = 10,
     add_special_tokens: bool = False,
-    rng: Optional[np.random.Generator] = None,
+    rng: np.random.Generator | None = None,
 ) -> tuple[str, list[int]]:
     """
     Ensure decoded-then-encoded prompt length matches the target token length.
@@ -1054,9 +1054,9 @@ class ShareGPTDataset(BenchmarkDataset):
         self,
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
-        lora_path: Optional[str] = None,
-        max_loras: Optional[int] = None,
-        output_len: Optional[int] = None,
+        lora_path: str | None = None,
+        max_loras: int | None = None,
+        output_len: int | None = None,
         enable_multimodal_chat: bool = False,
         request_id_prefix: str = "",
         no_oversample: bool = False,
@@ -1766,9 +1766,9 @@ class CustomDataset(BenchmarkDataset):
         self,
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
-        lora_path: Optional[str] = None,
-        max_loras: Optional[int] = None,
-        output_len: Optional[int] = None,
+        lora_path: str | None = None,
+        max_loras: int | None = None,
+        output_len: int | None = None,
         enable_multimodal_chat: bool = False,
         skip_chat_template: bool = False,
         request_id_prefix: str = "",
@@ -1997,8 +1997,8 @@ class BurstGPTDataset(BenchmarkDataset):
         self,
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
-        max_loras: Optional[int] = None,
-        lora_path: Optional[str] = None,
+        max_loras: int | None = None,
+        lora_path: str | None = None,
         request_id_prefix: str = "",
         no_oversample: bool = False,
         **kwargs,
@@ -2034,15 +2034,15 @@ class BurstGPTDataset(BenchmarkDataset):
 class HuggingFaceDataset(BenchmarkDataset):
     """Base class for datasets hosted on HuggingFace."""
 
-    SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set()
+    SUPPORTED_DATASET_PATHS: set[str] | dict[str, Callable] = set()
 
     def __init__(
         self,
         dataset_path: str,
         dataset_split: str,
         no_stream: bool = False,
-        dataset_subset: Optional[str] = None,
-        hf_name: Optional[str] = None,
+        dataset_subset: str | None = None,
+        hf_name: str | None = None,
         **kwargs,
     ) -> None:
         super().__init__(dataset_path=dataset_path, **kwargs)
@@ -2083,7 +2083,7 @@ class ConversationDataset(HuggingFaceDataset):
         self,
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
-        output_len: Optional[int] = None,
+        output_len: int | None = None,
         enable_multimodal_chat: bool = False,
         request_id_prefix: str = "",
         no_oversample: bool = False,
@@ -2152,7 +2152,7 @@ class VisionArenaDataset(HuggingFaceDataset):
         self,
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
-        output_len: Optional[int] = None,
+        output_len: int | None = None,
         enable_multimodal_chat: bool = False,
         request_id_prefix: str = "",
         no_oversample: bool = False,
@@ -2206,7 +2206,7 @@ class MMVUDataset(HuggingFaceDataset):
         self,
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
-        output_len: Optional[int] = None,
+        output_len: int | None = None,
         enable_multimodal_chat: bool = False,
         request_id_prefix: str = "",
         no_oversample: bool = False,
@@ -2267,7 +2267,7 @@ class InstructCoderDataset(HuggingFaceDataset):
         self,
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
-        output_len: Optional[int] = None,
+        output_len: int | None = None,
         enable_multimodal_chat: bool = False,
         skip_chat_template: bool = False,
         request_id_prefix: str = "",
@@ -2331,7 +2331,7 @@ class MTBenchDataset(HuggingFaceDataset):
         self,
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
-        output_len: Optional[int] = None,
+        output_len: int | None = None,
         enable_multimodal_chat: bool = False,
         skip_chat_template: bool = False,
         request_id_prefix: str = "",
@@ -2397,7 +2397,7 @@ class BlazeditDataset(HuggingFaceDataset):
         self,
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
-        output_len: Optional[int] = None,
+        output_len: int | None = None,
         skip_chat_template: bool = False,
         request_id_prefix: str = "",
         no_oversample: bool = False,
@@ -2478,7 +2478,7 @@ class AIMODataset(HuggingFaceDataset):
         self,
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
-        output_len: Optional[int] = None,
+        output_len: int | None = None,
         request_id_prefix: str = "",
         no_oversample: bool = False,
         **kwargs,
@@ -2660,7 +2660,7 @@ class ASRDataset(HuggingFaceDataset):
         self,
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
-        output_len: Optional[int] = None,
+        output_len: int | None = None,
         request_id_prefix: str = "",
         no_oversample: bool = False,
         **kwargs,
@@ -2738,7 +2738,7 @@ class MLPerfDataset(HuggingFaceDataset):
         self,
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
-        output_len: Optional[int] = None,
+        output_len: int | None = None,
         request_id_prefix: str = "",
         no_oversample: bool = False,
         **kwargs,
@@ -2902,7 +2902,7 @@ class MMStarDataset(HuggingFaceDataset):
         self,
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
-        output_len: Optional[int] = None,
+        output_len: int | None = None,
         enable_multimodal_chat: bool = False,
         request_id_prefix: str = "",
         no_oversample: bool = False,
diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py
index 7692697fe768a..b4f1751837f48 100644
--- a/vllm/benchmarks/latency.py
+++ b/vllm/benchmarks/latency.py
@@ -7,7 +7,7 @@ import dataclasses
 import json
 import os
 import time
-from typing import Any, Optional
+from typing import Any
 
 import numpy as np
 from tqdm import tqdm
@@ -127,7 +127,7 @@ def main(args: argparse.Namespace):
                 ),
             )
 
-    def run_to_completion(profile_dir: Optional[str] = None):
+    def run_to_completion(profile_dir: str | None = None):
         if profile_dir:
             llm.start_profile()
             llm_generate()
diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py
index 28146ce6200d1..2e5c100a3031d 100644
--- a/vllm/benchmarks/lib/endpoint_request_func.py
+++ b/vllm/benchmarks/lib/endpoint_request_func.py
@@ -10,7 +10,7 @@ import time
 import traceback
 from collections.abc import Awaitable
 from dataclasses import dataclass, field
-from typing import Any, Literal, Optional, Protocol, Union
+from typing import Any, Literal, Protocol
 
 import aiohttp
 import regex as re
@@ -69,14 +69,14 @@ class RequestFuncInput:
     prompt_len: int
     output_len: int
     model: str
-    model_name: Optional[str] = None
-    logprobs: Optional[int] = None
-    extra_headers: Optional[dict] = None
-    extra_body: Optional[dict] = None
-    multi_modal_content: Optional[Union[dict, list[dict]]] = None
+    model_name: str | None = None
+    logprobs: int | None = None
+    extra_headers: dict | None = None
+    extra_body: dict | None = None
+    multi_modal_content: dict | list[dict] | None = None
     ignore_eos: bool = False
-    language: Optional[str] = None
-    request_id: Optional[str] = None
+    language: str | None = None
+    request_id: str | None = None
 
 
 @dataclass
@@ -100,14 +100,14 @@ class RequestFunc(Protocol):
         self,
         request_func_input: RequestFuncInput,
         session: aiohttp.ClientSession,
-        pbar: Optional[tqdm] = None,
+        pbar: tqdm | None = None,
     ) -> Awaitable[RequestFuncOutput]: ...
 
 
 def _validate_api_url(
     api_url: str,
     api_name: str,
-    expected_suffixes: Union[str, set[str]],
+    expected_suffixes: str | set[str],
 ) -> None:
     if isinstance(expected_suffixes, str):
         expected_suffixes = {expected_suffixes}
@@ -141,7 +141,7 @@ def _update_headers_common(
 async def async_request_openai_completions(
     request_func_input: RequestFuncInput,
     session: aiohttp.ClientSession,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     """The async request function for the OpenAI Completions API.
 
@@ -279,7 +279,7 @@ def _get_chat_content(
 async def async_request_openai_chat_completions(
     request_func_input: RequestFuncInput,
     session: aiohttp.ClientSession,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
     mm_position: Literal["first", "last"] = "last",
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
@@ -376,7 +376,7 @@ async def async_request_openai_chat_completions(
 async def async_request_openai_audio(
     request_func_input: RequestFuncInput,
     session: aiohttp.ClientSession,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     # Lazy import without PlaceholderModule to avoid vllm dep.
     import soundfile
@@ -489,7 +489,7 @@ async def _run_openai_embeddings(
     api_url: str,
     payload: dict[str, Any],
     headers: dict[str, Any],
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     output = RequestFuncOutput()
     st = time.perf_counter()
@@ -517,7 +517,7 @@ async def _run_openai_embeddings(
 async def async_request_openai_embeddings(
     request_func_input: RequestFuncInput,
     session: aiohttp.ClientSession,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     _validate_api_url(api_url, "OpenAI Embeddings API", "embeddings")
@@ -548,7 +548,7 @@ async def async_request_openai_embeddings(
 async def async_request_openai_embeddings_chat(
     request_func_input: RequestFuncInput,
     session: aiohttp.ClientSession,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
     mm_position: Literal["first", "last"] = "last",
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
@@ -627,7 +627,7 @@ def _preprocess_vlm2vec(request_func_input: RequestFuncInput):
 async def async_request_openai_embeddings_clip(
     request_func_input: RequestFuncInput,
     session: aiohttp.ClientSession,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     _preprocess_clip(request_func_input)
 
@@ -641,7 +641,7 @@ async def async_request_openai_embeddings_clip(
 async def async_request_openai_embeddings_vlm2vec(
     request_func_input: RequestFuncInput,
     session: aiohttp.ClientSession,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     _preprocess_vlm2vec(request_func_input)
 
@@ -656,7 +656,7 @@ async def async_request_openai_embeddings_vlm2vec(
 async def async_request_infinity_embeddings(
     request_func_input: RequestFuncInput,
     session: aiohttp.ClientSession,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     _validate_api_url(api_url, "Infinity Embeddings API", "embeddings")
@@ -697,7 +697,7 @@ async def async_request_infinity_embeddings(
 async def async_request_infinity_embeddings_clip(
     request_func_input: RequestFuncInput,
     session: aiohttp.ClientSession,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     _preprocess_clip(request_func_input)
 
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index c3c45f05f800b..c52e384a40023 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -31,7 +31,7 @@ from collections.abc import AsyncGenerator, Iterable
 from dataclasses import dataclass
 from datetime import datetime
 from enum import Enum
-from typing import Any, Literal, Optional
+from typing import Any, Literal
 
 import aiohttp
 import numpy as np
@@ -107,9 +107,9 @@ class EmbedBenchmarkMetrics:
 
 
 def _get_current_request_rate(
-    ramp_up_strategy: Optional[Literal["linear", "exponential"]],
-    ramp_up_start_rps: Optional[int],
-    ramp_up_end_rps: Optional[int],
+    ramp_up_strategy: Literal["linear", "exponential"] | None,
+    ramp_up_start_rps: int | None,
+    ramp_up_end_rps: int | None,
     request_index: int,
     total_requests: int,
     request_rate: float,
@@ -135,9 +135,9 @@ async def get_request(
     input_requests: list[SampleRequest],
     request_rate: float,
     burstiness: float = 1.0,
-    ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
-    ramp_up_start_rps: Optional[int] = None,
-    ramp_up_end_rps: Optional[int] = None,
+    ramp_up_strategy: Literal["linear", "exponential"] | None = None,
+    ramp_up_start_rps: int | None = None,
+    ramp_up_end_rps: int | None = None,
 ) -> AsyncGenerator[tuple[SampleRequest, float], None]:
     """
     Asynchronously generates requests at a specified rate
@@ -474,7 +474,7 @@ async def benchmark(
     model_name: str,
     tokenizer: PreTrainedTokenizerBase,
     input_requests: list[SampleRequest],
-    logprobs: Optional[int],
+    logprobs: int | None,
     request_rate: float,
     burstiness: float,
     disable_tqdm: bool,
@@ -483,13 +483,13 @@ async def benchmark(
     selected_percentiles: list[float],
     ignore_eos: bool,
     goodput_config_dict: dict[str, float],
-    max_concurrency: Optional[int],
-    lora_modules: Optional[Iterable[str]],
-    extra_headers: Optional[dict],
-    extra_body: Optional[dict],
-    ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
-    ramp_up_start_rps: Optional[int] = None,
-    ramp_up_end_rps: Optional[int] = None,
+    max_concurrency: int | None,
+    lora_modules: Iterable[str] | None,
+    extra_headers: dict | None,
+    extra_body: dict | None,
+    ramp_up_strategy: Literal["linear", "exponential"] | None = None,
+    ramp_up_start_rps: int | None = None,
+    ramp_up_end_rps: int | None = None,
     ready_check_timeout_sec: int = 600,
 ):
     try:
diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py
index b0f63fd2c7227..01c6824ac91f8 100644
--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -9,7 +9,7 @@ import os
 import random
 import time
 import warnings
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 import uvloop
@@ -43,7 +43,7 @@ def run_vllm(
     engine_args: EngineArgs,
     do_profile: bool,
     disable_detokenize: bool = False,
-) -> tuple[float, Optional[list[RequestOutput]]]:
+) -> tuple[float, list[RequestOutput] | None]:
     from vllm import LLM, SamplingParams
 
     llm = LLM(**dataclasses.asdict(engine_args))
@@ -56,7 +56,7 @@ def run_vllm(
         " prompt_len and expected_output_len for all requests."
     )
     # Add the requests to the engine.
-    prompts: list[Union[TextPrompt, TokensPrompt]] = []
+    prompts: list[TextPrompt | TokensPrompt] = []
     sampling_params: list[SamplingParams] = []
     for request in requests:
         prompt = (
@@ -79,7 +79,7 @@ def run_vllm(
                 detokenize=not disable_detokenize,
             )
         )
-    lora_requests: Optional[list[LoRARequest]] = None
+    lora_requests: list[LoRARequest] | None = None
     if engine_args.enable_lora:
         lora_requests = [request.lora_request for request in requests]
 
@@ -197,9 +197,9 @@ async def run_vllm_async(
         )
 
         # Add the requests to the engine.
-        prompts: list[Union[TextPrompt, TokensPrompt]] = []
+        prompts: list[TextPrompt | TokensPrompt] = []
         sampling_params: list[SamplingParams] = []
-        lora_requests: list[Optional[LoRARequest]] = []
+        lora_requests: list[LoRARequest | None] = []
         for request in requests:
             prompt = (
                 TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"])
@@ -696,7 +696,7 @@ def main(args: argparse.Namespace):
     )
     requests = get_requests(args, tokenizer)
     is_multi_modal = any(request.multi_modal_data is not None for request in requests)
-    request_outputs: Optional[list[RequestOutput]] = None
+    request_outputs: list[RequestOutput] | None = None
     if args.backend == "vllm":
         if args.async_engine:
             elapsed_time = uvloop.run(
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 826ab42462c3b..e559fdb397fa3 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -7,9 +7,9 @@ import hashlib
 import os
 import pprint
 import time
-from collections.abc import Sequence
+from collections.abc import Callable, Sequence
 from contextlib import contextmanager
-from typing import Any, Callable, Optional
+from typing import Any
 
 import torch
 import torch.fx as fx
@@ -75,7 +75,7 @@ class CompilerManager:
     """
 
     def __init__(self, compilation_config: CompilationConfig):
-        self.cache: dict[tuple[Optional[int], int, str], Any] = dict()
+        self.cache: dict[tuple[int | None, int, str], Any] = dict()
         self.is_cache_updated = False
         self.compilation_config = compilation_config
         self.compiler = make_compiler(compilation_config)
@@ -84,7 +84,7 @@ class CompilerManager:
         return self.compiler.compute_hash(vllm_config)
 
     @contextmanager
-    def compile_context(self, runtime_shape: Optional[int] = None):
+    def compile_context(self, runtime_shape: int | None = None):
         """Provide compilation context for the duration of compilation to set
         any torch global properties we want to scope to a single Inductor
         compilation (e.g. partition rules, pass context)."""
@@ -145,8 +145,8 @@ class CompilerManager:
         graph: fx.GraphModule,
         example_inputs: list[Any],
         graph_index: int,
-        runtime_shape: Optional[int] = None,
-    ) -> Optional[Callable]:
+        runtime_shape: int | None = None,
+    ) -> Callable | None:
         if (runtime_shape, graph_index, self.compiler.name) not in self.cache:
             return None
         handle = self.cache[(runtime_shape, graph_index, self.compiler.name)]
@@ -178,7 +178,7 @@ class CompilerManager:
         compilation_config: CompilationConfig,
         graph_index: int = 0,
         num_graphs: int = 1,
-        runtime_shape: Optional[int] = None,
+        runtime_shape: int | None = None,
     ) -> Any:
         if graph_index == 0:
             # before compiling the first graph, record the start time
@@ -656,7 +656,8 @@ class VllmBackend:
 
         graph_path = os.path.join(local_cache_dir, "computation_graph.py")
         if not os.path.exists(graph_path):
-            # code adapted from https://github.com/thuml/depyf/blob/dab831108a752d1facc00acdd6d4243891845c37/depyf/explain/patched_lazy_format_graph_code.py#L30 # noqa
+            # code adapted from
+            # https://github.com/thuml/depyf/blob/dab831108a752d1facc00acdd6d4243891845c37/depyf/explain/patched_lazy_format_graph_code.py#L30
             # use `print_readable` because it can include submodules
             src = (
                 "from __future__ import annotations\nimport torch\n"
diff --git a/vllm/compilation/base_static_graph.py b/vllm/compilation/base_static_graph.py
index 6ee82e74963d9..12f1ff5bc0447 100644
--- a/vllm/compilation/base_static_graph.py
+++ b/vllm/compilation/base_static_graph.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Protocol
+from collections.abc import Callable
+from typing import Any, Protocol
 
 from vllm.config import CUDAGraphMode, VllmConfig
 
diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py
index 988a1069cd9e7..1dc8888607f54 100644
--- a/vllm/compilation/collective_fusion.py
+++ b/vllm/compilation/collective_fusion.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from importlib.util import find_spec
-from typing import Optional
 
 import torch
 import torch._inductor.pattern_matcher as pm
@@ -432,7 +431,7 @@ class AsyncTPPass(VllmPatternMatcherPass):
 
         self.dump_patterns(config, self.patterns)
 
-    def is_applicable_for_shape(self, shape: Optional[int]) -> bool:
+    def is_applicable_for_shape(self, shape: int | None) -> bool:
         # only do replace for specific shapes
         tp_size = get_tensor_model_parallel_world_size()
         return shape is not None and shape % tp_size == 0
@@ -485,10 +484,10 @@ if flashinfer_comm is not None:
         max_token_num: int,
         pattern_code: int,
         fuse_rms_quant: bool,
-        norm_out: Optional[torch.Tensor] = None,
-        quant_out: Optional[torch.Tensor] = None,
-        scale_out: Optional[torch.Tensor] = None,
-        scale_factor: Optional[torch.Tensor] = None,
+        norm_out: torch.Tensor | None = None,
+        quant_out: torch.Tensor | None = None,
+        scale_out: torch.Tensor | None = None,
+        scale_factor: torch.Tensor | None = None,
     ) -> None:
         num_tokens, hidden_size = allreduce_in.shape
         element_size = allreduce_in.element_size()
@@ -589,10 +588,10 @@ if flashinfer_comm is not None:
         max_token_num: int,
         pattern_code: int,
         fuse_rms_quant: bool,
-        norm_out: Optional[torch.Tensor] = None,
-        quant_out: Optional[torch.Tensor] = None,
-        scale_out: Optional[torch.Tensor] = None,
-        scale_factor: Optional[torch.Tensor] = None,
+        norm_out: torch.Tensor | None = None,
+        quant_out: torch.Tensor | None = None,
+        scale_out: torch.Tensor | None = None,
+        scale_factor: torch.Tensor | None = None,
     ) -> None:
         pass
 
diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py
index e5fa2518b87be..4553007027e39 100644
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -4,8 +4,9 @@ import contextlib
 import copy
 import hashlib
 import os
+from collections.abc import Callable
 from contextlib import ExitStack
-from typing import Any, Callable, Optional
+from typing import Any
 from unittest.mock import patch
 
 import torch
@@ -62,9 +63,9 @@ class CompilerInterface:
         graph: fx.GraphModule,
         example_inputs: list[Any],
         compiler_config: dict[str, Any],
-        runtime_shape: Optional[int] = None,
-        key: Optional[str] = None,
-    ) -> tuple[Optional[Callable], Optional[Any]]:
+        runtime_shape: int | None = None,
+        key: str | None = None,
+    ) -> tuple[Callable | None, Any | None]:
         """
         Compile the graph with the given example inputs and compiler config,
         with a runtime shape. If the `runtime_shape` is None, it means
@@ -97,7 +98,7 @@ class CompilerInterface:
         graph: fx.GraphModule,
         example_inputs: list[Any],
         graph_index: int,
-        runtime_shape: Optional[int] = None,
+        runtime_shape: int | None = None,
     ) -> Callable:
         """
         Load the compiled function from the handle.
@@ -191,9 +192,9 @@ class InductorStandaloneAdaptor(CompilerInterface):
         graph: fx.GraphModule,
         example_inputs: list[Any],
         compiler_config: dict[str, Any],
-        runtime_shape: Optional[int] = None,
-        key: Optional[str] = None,
-    ) -> tuple[Optional[Callable], Optional[Any]]:
+        runtime_shape: int | None = None,
+        key: str | None = None,
+    ) -> tuple[Callable | None, Any | None]:
         compilation_counter.num_inductor_compiles += 1
         current_config = {}
         if compiler_config is not None:
@@ -229,7 +230,7 @@ class InductorStandaloneAdaptor(CompilerInterface):
         graph: fx.GraphModule,
         example_inputs: list[Any],
         graph_index: int,
-        runtime_shape: Optional[int] = None,
+        runtime_shape: int | None = None,
     ) -> Callable:
         assert isinstance(handle, tuple)
         assert isinstance(handle[0], str)
@@ -293,9 +294,9 @@ class InductorAdaptor(CompilerInterface):
         graph: fx.GraphModule,
         example_inputs: list[Any],
         compiler_config: dict[str, Any],
-        runtime_shape: Optional[int] = None,
-        key: Optional[str] = None,
-    ) -> tuple[Optional[Callable], Optional[Any]]:
+        runtime_shape: int | None = None,
+        key: str | None = None,
+    ) -> tuple[Callable | None, Any | None]:
         compilation_counter.num_inductor_compiles += 1
         from torch._inductor.compile_fx import compile_fx
 
@@ -492,7 +493,7 @@ class InductorAdaptor(CompilerInterface):
         graph: fx.GraphModule,
         example_inputs: list[Any],
         graph_index: int,
-        runtime_shape: Optional[int] = None,
+        runtime_shape: int | None = None,
     ) -> Callable:
         assert isinstance(handle, tuple)
         assert isinstance(handle[0], str)
@@ -610,9 +611,9 @@ class EagerAdaptor(CompilerInterface):
         graph: fx.GraphModule,
         example_inputs: list[Any],
         compiler_config: dict[str, Any],
-        runtime_shape: Optional[int] = None,
-        key: Optional[str] = None,
-    ) -> tuple[Optional[Callable], Optional[Any]]:
+        runtime_shape: int | None = None,
+        key: str | None = None,
+    ) -> tuple[Callable | None, Any | None]:
         compilation_counter.num_eager_compiles += 1
         # we don't need to compile the graph, just return the graph itself.
         # It does not support caching, return None for the handle.
diff --git a/vllm/compilation/cuda_graph.py b/vllm/compilation/cuda_graph.py
index 4c3ac9e56a377..fe20a5f7e63e7 100644
--- a/vllm/compilation/cuda_graph.py
+++ b/vllm/compilation/cuda_graph.py
@@ -2,8 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
+from collections.abc import Callable
 from contextlib import ExitStack
-from typing import Any, Callable, Optional
+from typing import Any
 from unittest.mock import patch
 
 import torch
@@ -24,12 +25,12 @@ logger = init_logger(__name__)
 @dataclasses.dataclass
 class CUDAGraphEntry:
     batch_descriptor: BatchDescriptor
-    cudagraph: Optional[torch.cuda.CUDAGraph] = None
-    output: Optional[Any] = None
+    cudagraph: torch.cuda.CUDAGraph | None = None
+    output: Any | None = None
 
     # for cudagraph debugging, track the input addresses
     # during capture, and check if they are the same during replay
-    input_addresses: Optional[list[int]] = None
+    input_addresses: list[int] | None = None
 
 
 @dataclasses.dataclass
@@ -69,7 +70,7 @@ class CUDAGraphWrapper:
         runnable: Callable,
         vllm_config: VllmConfig,
         runtime_mode: CUDAGraphMode,
-        cudagraph_options: Optional[CUDAGraphOptions] = None,
+        cudagraph_options: CUDAGraphOptions | None = None,
     ):
         self.runnable = runnable
         self.vllm_config = vllm_config
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 20bf63c804012..fe19d4e851294 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -6,7 +6,8 @@ import hashlib
 import inspect
 import os
 import sys
-from typing import Callable, Optional, TypeVar, Union, overload
+from collections.abc import Callable
+from typing import TypeVar, overload
 from unittest.mock import patch
 
 import torch
@@ -61,14 +62,14 @@ def _should_ignore_torch_compile(cls) -> bool:
 @overload
 def support_torch_compile(
     *,
-    enable_if: Optional[Callable[[VllmConfig], bool]] = None,
+    enable_if: Callable[[VllmConfig], bool] | None = None,
 ) -> Callable[[_T], _T]: ...
 
 
 @overload
 def support_torch_compile(
     *,
-    dynamic_arg_dims: Optional[dict[str, Union[int, list[int]]]],
+    dynamic_arg_dims: dict[str, int | list[int]] | None,
 ) -> Callable[[_T], _T]: ...
 
 
@@ -77,11 +78,11 @@ def support_torch_compile(cls: _T) -> _T: ...
 
 
 def support_torch_compile(
-    cls: Optional[_T] = None,
+    cls: _T | None = None,
     *,
-    dynamic_arg_dims: Optional[dict[str, Union[int, list[int]]]] = None,
-    enable_if: Optional[Callable[[VllmConfig], bool]] = None,
-) -> Union[Callable[[_T], _T], _T]:
+    dynamic_arg_dims: dict[str, int | list[int]] | None = None,
+    enable_if: Callable[[VllmConfig], bool] | None = None,
+) -> Callable[[_T], _T] | _T:
     """
     A decorator to add support for compiling the forward method of a class.
 
@@ -147,9 +148,9 @@ def support_torch_compile(
             for k, v in sig.parameters.items():
                 if v.annotation in [
                     torch.Tensor,
-                    Optional[torch.Tensor],
+                    torch.Tensor | None,
                     IntermediateTensors,
-                    Optional[IntermediateTensors],
+                    IntermediateTensors | None,
                 ]:
                     inferred_dynamic_arg_dims[k] = 0
 
@@ -209,8 +210,8 @@ def _verify_source_unchanged(source_info, vllm_config) -> None:
 
 def _support_torch_compile(
     cls: _T,
-    dynamic_arg_dims: dict[str, Union[int, list[int]]],
-    enable_if: Optional[Callable[[VllmConfig], bool]] = None,
+    dynamic_arg_dims: dict[str, int | list[int]],
+    enable_if: Callable[[VllmConfig], bool] | None = None,
 ) -> _T:
     """
     A decorator to add support for compiling the forward method of a class.
diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py
index 0dffb343f9a28..29462d9ff0e50 100644
--- a/vllm/compilation/fix_functionalization.py
+++ b/vllm/compilation/fix_functionalization.py
@@ -3,7 +3,6 @@
 
 import operator
 from collections.abc import Iterable
-from typing import Optional, Union
 
 import torch
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
@@ -150,7 +149,7 @@ class FixFunctionalizationPass(VllmInductorPass):
         )
         self.nodes_to_remove.clear()
 
-    def _remove(self, node_or_nodes: Union[torch.fx.Node, Iterable[torch.fx.Node]]):
+    def _remove(self, node_or_nodes: torch.fx.Node | Iterable[torch.fx.Node]):
         """
         Stage a node (or nodes) for removal at the end of the pass.
         """
@@ -163,8 +162,8 @@ class FixFunctionalizationPass(VllmInductorPass):
         self,
         graph: torch.fx.Graph,
         node: torch.fx.Node,
-        mutated_args: dict[int, Union[torch.fx.Node, str]],
-        args: Optional[tuple[Union[torch.fx.Node, str], ...]] = None,
+        mutated_args: dict[int, torch.fx.Node | str],
+        args: tuple[torch.fx.Node | str, ...] | None = None,
     ):
         """
         De-functionalize a node by replacing it with a call to the original.
@@ -176,7 +175,7 @@ class FixFunctionalizationPass(VllmInductorPass):
         self._remove(node)
 
     def replace_users_with_mutated_args(
-        self, node: torch.fx.Node, mutated_args: dict[int, Union[torch.fx.Node, str]]
+        self, node: torch.fx.Node, mutated_args: dict[int, torch.fx.Node | str]
     ):
         """
         Replace all getitem users of the auto-functionalized node with the
@@ -207,7 +206,7 @@ class FixFunctionalizationPass(VllmInductorPass):
         self,
         graph: torch.fx.Graph,
         node: torch.fx.Node,
-        args: Optional[tuple[Union[torch.fx.Node, str], ...]] = None,
+        args: tuple[torch.fx.Node | str, ...] | None = None,
     ):
         """
         Insert a new defunctionalized node into the graph before node.
diff --git a/vllm/compilation/fx_utils.py b/vllm/compilation/fx_utils.py
index 114b53c74c48f..45fe88a5f4d38 100644
--- a/vllm/compilation/fx_utils.py
+++ b/vllm/compilation/fx_utils.py
@@ -3,7 +3,6 @@
 
 import operator
 from collections.abc import Iterable, Iterator
-from typing import Optional
 
 from torch import fx
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
@@ -19,9 +18,7 @@ def is_auto_func(node: fx.Node, op: OpOverload) -> bool:
 
 
 # Returns the first specified node with the given op (if it exists)
-def find_specified_fn_maybe(
-    nodes: Iterable[fx.Node], op: OpOverload
-) -> Optional[fx.Node]:
+def find_specified_fn_maybe(nodes: Iterable[fx.Node], op: OpOverload) -> fx.Node | None:
     for node in nodes:
         if node.target == op:
             return node
@@ -36,7 +33,7 @@ def find_specified_fn(nodes: Iterable[fx.Node], op: OpOverload) -> fx.Node:
 
 
 # Returns the first auto_functionalized node with the given op (if it exists)
-def find_auto_fn_maybe(nodes: Iterable[fx.Node], op: OpOverload) -> Optional[fx.Node]:
+def find_auto_fn_maybe(nodes: Iterable[fx.Node], op: OpOverload) -> fx.Node | None:
     for node in nodes:
         if is_func(node, auto_functionalized) and node.args[0] == op:  # noqa
             return node
@@ -52,7 +49,7 @@ def find_auto_fn(nodes: Iterable[fx.Node], op: OpOverload) -> fx.Node:
 
 # Returns the getitem node that extracts the idx-th element from node
 # (if it exists)
-def find_getitem_maybe(node: fx.Node, idx: int) -> Optional[fx.Node]:
+def find_getitem_maybe(node: fx.Node, idx: int) -> fx.Node | None:
     for user in node.users:
         if is_func(user, operator.getitem) and user.args[1] == idx:
             return user
diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py
index 9085448d23978..b9ec3cf6c5edb 100644
--- a/vllm/compilation/inductor_pass.py
+++ b/vllm/compilation/inductor_pass.py
@@ -6,8 +6,9 @@ import hashlib
 import inspect
 import json
 import types
+from collections.abc import Callable
 from contextlib import contextmanager
-from typing import Any, Callable, Optional, Union
+from typing import Any
 
 import torch
 from torch import fx
@@ -27,7 +28,7 @@ _pass_context = None
 
 
 class PassContext:
-    def __init__(self, runtime_shape: Optional[int]):
+    def __init__(self, runtime_shape: int | None):
         self.runtime_shape = runtime_shape
 
 
@@ -38,7 +39,7 @@ def get_pass_context() -> PassContext:
 
 
 @contextmanager
-def pass_context(runtime_shape: Optional[int]):
+def pass_context(runtime_shape: int | None):
     """A context manager that stores the current pass context,
     usually it is a list of sizes to specialize.
     """
@@ -67,7 +68,7 @@ class InductorPass(CustomGraphPass):
         return InductorPass.hash_source(self)
 
     @staticmethod
-    def hash_source(*srcs: Union[str, Any]):
+    def hash_source(*srcs: str | Any):
         """
         Utility method to hash the sources of functions or objects.
         :param srcs: strings or objects to add to the hash.
@@ -95,7 +96,7 @@ class InductorPass(CustomGraphPass):
         encoded = json.dumps(dict_, sort_keys=True).encode("utf-8")
         return hashlib.sha256(encoded).hexdigest()
 
-    def is_applicable_for_shape(self, shape: Optional[int]):
+    def is_applicable_for_shape(self, shape: int | None):
         return True
 
 
@@ -105,9 +106,7 @@ class CallableInductorPass(InductorPass):
     implementation of the UUID.
     """
 
-    def __init__(
-        self, callable: Callable[[fx.Graph], None], uuid: Optional[Any] = None
-    ):
+    def __init__(self, callable: Callable[[fx.Graph], None], uuid: Any | None = None):
         self.callable = callable
         self._uuid = self.hash_source(callable) if uuid is None else uuid
 
diff --git a/vllm/compilation/noop_elimination.py b/vllm/compilation/noop_elimination.py
index 45668c7af3151..42b8d3daac985 100644
--- a/vllm/compilation/noop_elimination.py
+++ b/vllm/compilation/noop_elimination.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
-from typing import Union
 
 import torch.fx
 from torch import SymInt
@@ -105,9 +104,7 @@ class NoOpEliminationPass(VllmInductorPass):
         logger.debug("Removed %s no-op reshapes and slices", count)
 
     # ---------------------- Shape comparison helpers ----------------------
-    def dims_equivalent(
-        self, dim: Union[int, SymInt], i_dim: Union[int, SymInt]
-    ) -> bool:
+    def dims_equivalent(self, dim: int | SymInt, i_dim: int | SymInt) -> bool:
         """
         This function checks if two dimensions are equivalent.
         :param dim: The dimension arg to reshape/slice
@@ -127,7 +124,7 @@ class NoOpEliminationPass(VllmInductorPass):
         return False
 
     def all_dims_equivalent(
-        self, dims: Iterable[Union[int, SymInt]], i_dims: Iterable[Union[int, SymInt]]
+        self, dims: Iterable[int | SymInt], i_dims: Iterable[int | SymInt]
     ) -> bool:
         dims_ = list(dims)
         i_dims_ = list(i_dims)
diff --git a/vllm/compilation/partition_rules.py b/vllm/compilation/partition_rules.py
index c17a5bd4480c9..5ea1b30860f59 100644
--- a/vllm/compilation/partition_rules.py
+++ b/vllm/compilation/partition_rules.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import contextlib
 from typing import TYPE_CHECKING
 
@@ -16,7 +14,7 @@ if TYPE_CHECKING:
 logger = init_logger(__name__)
 
 
-def resolve_defined_ops(op_names: list[str]) -> list[torch._ops.OpOverload]:
+def resolve_defined_ops(op_names: list[str]) -> list["torch._ops.OpOverload"]:
     """Resolve operator names to OpOverload objects.
 
     Skips operators that fail to resolve (e.g., operators not registered or
@@ -49,7 +47,7 @@ def resolve_defined_ops(op_names: list[str]) -> list[torch._ops.OpOverload]:
 
 
 @contextlib.contextmanager
-def inductor_partition_rule_context(overloads: list[torch._ops.OpOverload]):
+def inductor_partition_rule_context(overloads: list["torch._ops.OpOverload"]):
     """Context manager to temporarily register Inductor partition rules.
 
     Registers custom partition rules for specified operators, forcing the
diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py
index 61551766a1c52..2931580afbbb0 100644
--- a/vllm/compilation/piecewise_backend.py
+++ b/vllm/compilation/piecewise_backend.py
@@ -2,7 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
-from typing import Any, Callable
+from collections.abc import Callable
+from typing import Any
 
 import torch.fx as fx
 
diff --git a/vllm/compilation/sequence_parallelism.py b/vllm/compilation/sequence_parallelism.py
index 2bc705c3b9a9c..8ff530cebd82d 100644
--- a/vllm/compilation/sequence_parallelism.py
+++ b/vllm/compilation/sequence_parallelism.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 import torch._inductor.pattern_matcher as pm
@@ -27,7 +26,7 @@ class _RMSNormAndQuantOpHelper:
         epsilon: float,
         dtype: torch.dtype,
         device: str,
-        quant_op: Optional[torch._ops.OpOverload] = None,
+        quant_op: torch._ops.OpOverload | None = None,
         **kwargs,
     ):
         self.epsilon = epsilon
@@ -110,7 +109,7 @@ class _SequenceParallelPatternHelper(_RMSNormAndQuantOpHelper):
         epsilon: float,
         dtype: torch.dtype,
         device: str,
-        quant_op: Optional[torch._ops.OpOverload] = None,
+        quant_op: torch._ops.OpOverload | None = None,
         **kwargs,
     ):
         super().__init__(epsilon, dtype, device, quant_op=quant_op, **kwargs)
@@ -483,7 +482,7 @@ class SequenceParallelismPass(VllmPatternMatcherPass):
             ).register(self.patterns)
         self.dump_patterns(config, self.patterns)
 
-    def is_applicable_for_shape(self, shape: Optional[int]) -> bool:
+    def is_applicable_for_shape(self, shape: int | None) -> bool:
         tp_size = get_tensor_model_parallel_world_size()
         return shape is not None and shape % tp_size == 0
 
diff --git a/vllm/compilation/torch25_custom_graph_pass.py b/vllm/compilation/torch25_custom_graph_pass.py
index ea8b56cf9d6ac..1031856cdf008 100644
--- a/vllm/compilation/torch25_custom_graph_pass.py
+++ b/vllm/compilation/torch25_custom_graph_pass.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
-from typing import Any, Optional
+from typing import Any
 
 import torch
 
@@ -23,7 +23,7 @@ class Torch25CustomGraphPass(ABC):  # noqa (redefinition)
         """
 
     @abstractmethod
-    def uuid(self) -> Optional[Any]:
+    def uuid(self) -> Any | None:
         """
         Return an ID to uniquely identify your custom pass implementation.
         Return None to skip inductor code caching entirely.
diff --git a/vllm/compilation/vllm_inductor_pass.py b/vllm/compilation/vllm_inductor_pass.py
index 5aa08220bc2d7..ad83e7b3e0c2e 100644
--- a/vllm/compilation/vllm_inductor_pass.py
+++ b/vllm/compilation/vllm_inductor_pass.py
@@ -3,7 +3,7 @@
 import functools
 import operator
 import time
-from typing import ClassVar, Optional
+from typing import ClassVar
 
 import regex as re
 import torch
@@ -24,7 +24,7 @@ class VllmInductorPass(InductorPass):
     It provides timing, logging, and dumping utilities.
     """
 
-    dump_prefix: ClassVar[Optional[int]] = None
+    dump_prefix: ClassVar[int | None] = None
     """Keep track of pass index for debug dump ordering."""
 
     def __init__(self, config: VllmConfig):
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index 2007b655e2642..b4a0d89af0d6d 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -4,9 +4,9 @@
 import os
 import sys
 from abc import abstractmethod
+from collections.abc import Callable
 from contextlib import contextmanager
 from types import CodeType
-from typing import Callable, Optional
 
 import torch
 
@@ -31,7 +31,7 @@ class TorchCompileWrapperWithCustomDispatcher:
     """
 
     def __init__(
-        self, compiled_callable: Optional[Callable] = None, compilation_level: int = 0
+        self, compiled_callable: Callable | None = None, compilation_level: int = 0
     ):
         vllm_config = get_current_vllm_config()
         self.vllm_config = vllm_config
diff --git a/vllm/config/cache.py b/vllm/config/cache.py
index fd47d5c8f976f..04b1e7bf2ac1d 100644
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -3,7 +3,7 @@
 
 import hashlib
 from dataclasses import field
-from typing import TYPE_CHECKING, Any, Literal, Optional
+from typing import TYPE_CHECKING, Any, Literal
 
 from pydantic import Field, SkipValidation, field_validator
 from pydantic.dataclasses import dataclass
@@ -58,13 +58,13 @@ class CacheConfig:
     is_attention_free: bool = False
     """Whether the model is attention-free. This is primarily set in
     `ModelConfig` and that value should be manually duplicated here."""
-    num_gpu_blocks_override: Optional[int] = None
+    num_gpu_blocks_override: int | None = None
     """Number of GPU blocks to use. This overrides the profiled `num_gpu_blocks`
     if specified. Does nothing if `None`. Used for testing preemption."""
-    sliding_window: Optional[int] = None
+    sliding_window: int | None = None
     """Sliding window size for the KV cache. This is primarily set in
     `ModelConfig` and that value should be manually duplicated here."""
-    enable_prefix_caching: Optional[bool] = None
+    enable_prefix_caching: bool | None = None
     """Whether to enable prefix caching. Enabled by default for V1."""
     prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
     """Set the hash algorithm for prefix caching:\n
@@ -84,12 +84,12 @@ class CacheConfig:
     """This enables dynamic calculation of `k_scale` and `v_scale` when
     kv_cache_dtype is fp8. If `False`, the scales will be loaded from the model
     checkpoint if available. Otherwise, the scales will default to 1.0."""
-    cpu_kvcache_space_bytes: Optional[int] = None
+    cpu_kvcache_space_bytes: int | None = None
     """(CPU backend only) CPU key-value cache space."""
-    mamba_page_size_padded: Optional[int] = None
+    mamba_page_size_padded: int | None = None
     """ Optional override for mamba page size; used by hybrid mamba/attention
     models to ensure exact alignment with attention page size."""
-    mamba_block_size: Optional[int] = None
+    mamba_block_size: int | None = None
     """Size of a contiguous cache block in number of tokens for mamba cache."""
     mamba_cache_dtype: MambaDType = "auto"
     """The data type to use for the Mamba cache (both the conv as well as the
@@ -101,9 +101,9 @@ class CacheConfig:
     for the ssm state will be determined by mamba_cache_dtype."""
 
     # Will be set after profiling.
-    num_gpu_blocks: Optional[int] = field(default=None, init=False)
+    num_gpu_blocks: int | None = field(default=None, init=False)
     """The number of blocks to allocate for GPU memory."""
-    num_cpu_blocks: Optional[int] = field(default=None, init=False)
+    num_cpu_blocks: int | None = field(default=None, init=False)
     """The number of blocks to allocate for CPU memory."""
 
     kv_sharing_fast_prefill: bool = False
@@ -116,7 +116,7 @@ class CacheConfig:
     necessary for implementing this optimization in some models (e.g. Gemma3n)
     """
 
-    kv_cache_memory_bytes: Optional[int] = None
+    kv_cache_memory_bytes: int | None = None
     """Size of KV Cache per GPU in bytes. By default, this is set to None
     and vllm can automatically infer the kv cache size based on
     gpu_memory_utilization. However, users may want to manually specify
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 4209f3a9731c1..657c430049f86 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -4,9 +4,10 @@
 import enum
 import hashlib
 from collections import Counter
+from collections.abc import Callable
 from dataclasses import asdict, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union
+from typing import TYPE_CHECKING, Any, ClassVar
 
 from pydantic import TypeAdapter, field_validator
 from pydantic.dataclasses import dataclass
@@ -168,7 +169,7 @@ class CompilationConfig:
     """
 
     # Top-level Compilation control
-    level: Optional[int] = None
+    level: int | None = None
     """The level of compilation:
 
     - None: If None, we will select the default compilation level.
@@ -177,7 +178,7 @@ class CompilationConfig:
     - 1: dynamo as is.
     - 2: dynamo once.
     - 3: piecewise compilation."""
-    debug_dump_path: Optional[Path] = None
+    debug_dump_path: Path | None = None
     """The path to dump the debug information."""
     cache_dir: str = ""
     """The directory to store the compiled graph, to accelerate Inductor
@@ -208,7 +209,7 @@ class CompilationConfig:
     By default, all custom ops are enabled when running without Inductor and
     disabled when running with Inductor: level>=PIECEWISE and use_inductor=True.
     Inductor generates (fused) Triton kernels for disabled custom ops."""
-    splitting_ops: Optional[list[str]] = None
+    splitting_ops: list[str] | None = None
     """A list of ops to exclude from cudagraphs, used in piecewise compilation.
 
     The behavior depends on use_inductor_graph_partition:
@@ -238,7 +239,7 @@ class CompilationConfig:
         are compiled using configurations in inductor_compile_config.
 
     This setting is ignored if level<PIECEWISE."""
-    compile_sizes: Optional[list[Union[int, str]]] = None
+    compile_sizes: list[int | str] | None = None
     """Sizes to compile for inductor. In addition
     to integers, it also supports "cudagraph_capture_sizes" to
     specify the sizes for cudagraph capture."""
@@ -253,7 +254,7 @@ class CompilationConfig:
     constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""
 
     # CudaGraph compilation
-    cudagraph_mode: Optional[CUDAGraphMode] = None
+    cudagraph_mode: CUDAGraphMode | None = None
     """
     The mode of the cudagraph:
 
@@ -308,7 +309,7 @@ class CompilationConfig:
     It means the first several runs will be treated as warmup runs.
     Only after that, the execution will be recorded, and the recorded
     cudagraph will be used for subsequent runs."""
-    cudagraph_capture_sizes: Optional[list[int]] = None
+    cudagraph_capture_sizes: list[int] | None = None
     """Sizes to capture cudagraph.
     - None (default): capture sizes are inferred from vllm config.
     - list[int]: capture sizes are specified as given."""
@@ -320,7 +321,7 @@ class CompilationConfig:
     internally managed buffer. Default is False. 
     Note that this flag is only effective when cudagraph_mode is PIECEWISE.
     """
-    full_cuda_graph: Optional[bool] = False
+    full_cuda_graph: bool | None = False
     """whether to use a full cuda graph for the entire forward pass rather than
     splitting certain operations such as attention into subgraphs. Thus this
     flag cannot be used together with splitting_ops. This may provide
@@ -544,7 +545,7 @@ class CompilationConfig:
                     "(where 'op' is the registered op name)"
                 )
 
-    def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]:
+    def init_backend(self, vllm_config: "VllmConfig") -> str | Callable:
         if self.level == CompilationLevel.NO_COMPILATION:
             raise ValueError("No compilation level is set.")
 
diff --git a/vllm/config/device.py b/vllm/config/device.py
index 4b66424795413..e85cd15de8cf4 100644
--- a/vllm/config/device.py
+++ b/vllm/config/device.py
@@ -3,7 +3,7 @@
 
 import hashlib
 from dataclasses import field
-from typing import Any, Literal, Optional, Union
+from typing import Any, Literal
 
 import torch
 from pydantic import ConfigDict, SkipValidation
@@ -19,7 +19,7 @@ Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]
 class DeviceConfig:
     """Configuration for the device to use for vLLM execution."""
 
-    device: SkipValidation[Optional[Union[Device, torch.device]]] = "auto"
+    device: SkipValidation[Device | torch.device | None] = "auto"
     """Device type for vLLM execution.
     This parameter is deprecated and will be
     removed in a future release.
diff --git a/vllm/config/kv_events.py b/vllm/config/kv_events.py
index 1c6bdffa1281d..dc829113a8aa8 100644
--- a/vllm/config/kv_events.py
+++ b/vllm/config/kv_events.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 from pydantic.dataclasses import dataclass
 
@@ -26,7 +25,7 @@ class KVEventsConfig:
     """The zmq endpoint to use for publishing kv events.
     """
 
-    replay_endpoint: Optional[str] = None
+    replay_endpoint: str | None = None
     """The zmq endpoint to use for replaying kv events.
     """
 
diff --git a/vllm/config/kv_transfer.py b/vllm/config/kv_transfer.py
index b33294fd66f78..ba17d9a65f287 100644
--- a/vllm/config/kv_transfer.py
+++ b/vllm/config/kv_transfer.py
@@ -4,7 +4,7 @@
 import hashlib
 import uuid
 from dataclasses import field
-from typing import Any, Literal, Optional, get_args
+from typing import Any, Literal, get_args
 
 from pydantic.dataclasses import dataclass
 
@@ -20,14 +20,14 @@ KVRole = Literal[KVProducer, KVConsumer]
 class KVTransferConfig:
     """Configuration for distributed KV cache transfer."""
 
-    kv_connector: Optional[str] = None
+    kv_connector: str | None = None
     """The KV connector for vLLM to transmit KV caches between vLLM instances.
     """
 
-    engine_id: Optional[str] = None
+    engine_id: str | None = None
     """The engine id for KV transfers."""
 
-    kv_buffer_device: Optional[str] = "cuda"
+    kv_buffer_device: str | None = "cuda"
     """The device used by kv connector to buffer the KV cache. Choices are 
     'cuda' and 'cpu'."""
 
@@ -35,11 +35,11 @@ class KVTransferConfig:
     """The buffer size for TorchDistributedConnector. Measured in number of
     bytes. Recommended value: 1e9 (about 1GB)."""
 
-    kv_role: Optional[KVRole] = None
+    kv_role: KVRole | None = None
     """Whether this vLLM instance produces, consumes KV cache, or both. Choices
     are 'kv_producer', 'kv_consumer', and 'kv_both'."""
 
-    kv_rank: Optional[int] = None
+    kv_rank: int | None = None
     """The rank of this vLLM instance in the KV cache transfer. Typical value:
     0 for prefill instance, 1 for decode instance.
     Currently only 1P1D is supported."""
@@ -57,7 +57,7 @@ class KVTransferConfig:
     kv_connector_extra_config: dict[str, Any] = field(default_factory=dict)
     """any extra config that the connector may need."""
 
-    kv_connector_module_path: Optional[str] = None
+    kv_connector_module_path: str | None = None
     """The Python module path to dynamically load the KV connector from.
     Only supported in V1."""
 
diff --git a/vllm/config/load.py b/vllm/config/load.py
index aa35bc63d5d10..d625c1ac987e7 100644
--- a/vllm/config/load.py
+++ b/vllm/config/load.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import hashlib
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any
 
 from pydantic import Field, field_validator
 from pydantic.dataclasses import dataclass
@@ -25,7 +25,7 @@ logger = init_logger(__name__)
 class LoadConfig:
     """Configuration for loading the model weights."""
 
-    load_format: Union[str, LoadFormats] = "auto"
+    load_format: str | LoadFormats = "auto"
     """The format of the model weights to load:\n
     - "auto" will try to load the weights in the safetensors format and fall
     back to the pytorch bin format if safetensors format is not available.\n
@@ -48,7 +48,7 @@ class LoadConfig:
     - "mistral" will load weights from consolidated safetensors files used by
     Mistral models.
     - Other custom values can be supported via plugins."""
-    download_dir: Optional[str] = None
+    download_dir: str | None = None
     """Directory to download and load the weights, default to the default
     cache directory of Hugging Face."""
     safetensors_load_strategy: str = "lazy"
@@ -64,23 +64,19 @@ class LoadConfig:
       was quantized using torchao and saved using safetensors.
       Needs torchao >= 0.14.0
     """
-    model_loader_extra_config: Union[dict, TensorizerConfig] = Field(
-        default_factory=dict
-    )
+    model_loader_extra_config: dict | TensorizerConfig = Field(default_factory=dict)
     """Extra config for model loader. This will be passed to the model loader
     corresponding to the chosen load_format."""
-    device: Optional[str] = None
+    device: str | None = None
     """Device to which model weights will be loaded, default to
     device_config.device"""
-    ignore_patterns: Union[list[str], str] = Field(
-        default_factory=lambda: ["original/**/*"]
-    )
+    ignore_patterns: list[str] | str = Field(default_factory=lambda: ["original/**/*"])
     """The list of patterns to ignore when loading the model. Default to
     "original/**/*" to avoid repeated loading of llama's checkpoints."""
     use_tqdm_on_load: bool = True
     """Whether to enable tqdm for showing progress bar when loading model
     weights."""
-    pt_load_map_location: Union[str, dict[str, str]] = "cpu"
+    pt_load_map_location: str | dict[str, str] = "cpu"
     """
     pt_load_map_location: the map location for loading pytorch checkpoint, to
     support loading checkpoints can only be loaded on certain devices like
@@ -115,8 +111,8 @@ class LoadConfig:
 
     @field_validator("ignore_patterns", mode="after")
     def _validate_ignore_patterns(
-        cls, ignore_patterns: Union[list[str], str]
-    ) -> Union[list[str], str]:
+        cls, ignore_patterns: list[str] | str
+    ) -> list[str] | str:
         if ignore_patterns != ["original/**/*"] and len(ignore_patterns) > 0:
             logger.info(
                 "Ignoring the following patterns when downloading weights: %s",
diff --git a/vllm/config/lora.py b/vllm/config/lora.py
index c531618a186d9..2f9d638542b65 100644
--- a/vllm/config/lora.py
+++ b/vllm/config/lora.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import hashlib
-from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
+from typing import TYPE_CHECKING, Any, ClassVar, Literal
 
 import torch
 from pydantic import ConfigDict, Field, model_validator
@@ -42,10 +42,10 @@ class LoRAConfig:
     parallelism. Enabling this will use the fully sharded layers. At high
     sequence length, max rank or tensor parallel size, this is likely faster.
     """
-    max_cpu_loras: Optional[int] = None
+    max_cpu_loras: int | None = None
     """Maximum number of LoRAs to store in CPU memory. Must be >= than
     `max_loras`."""
-    lora_dtype: Union[torch.dtype, LoRADType] = "auto"
+    lora_dtype: torch.dtype | LoRADType = "auto"
     """Data type for LoRA. If auto, will default to base model dtype."""
     lora_extra_vocab_size: LoRAExtraVocabSize = Field(
         default=256,
@@ -60,7 +60,7 @@ class LoRAConfig:
     lora_vocab_padding_size: ClassVar[int] = (
         current_platform.get_lora_vocab_padding_size()
     )
-    default_mm_loras: Optional[dict[str, str]] = None
+    default_mm_loras: dict[str, str] | None = None
     """Dictionary mapping specific modalities to LoRA model paths; this field
     is only applicable to multimodal models and should be leveraged when a
     model always expects a LoRA to be active when a given modality is present.
diff --git a/vllm/config/model.py b/vllm/config/model.py
index d0c027e47675c..a2dcf52107546 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -4,18 +4,10 @@
 import hashlib
 import json
 import warnings
+from collections.abc import Callable
 from dataclasses import InitVar, field
 from importlib.util import find_spec
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Literal,
-    Optional,
-    Union,
-    cast,
-    get_args,
-)
+from typing import TYPE_CHECKING, Any, Literal, cast, get_args
 
 import torch
 from pydantic import ConfigDict, SkipValidation, field_validator, model_validator
@@ -89,7 +81,7 @@ ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
 LogprobsMode = Literal[
     "raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
 ]
-HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig], PretrainedConfig]]
+HfOverrides = dict[str, Any] | Callable[[PretrainedConfig], PretrainedConfig]
 ModelImpl = Literal["auto", "vllm", "transformers", "terratorch"]
 
 _RUNNER_TASKS: dict[RunnerType, list[TaskOption]] = {
@@ -121,7 +113,7 @@ class ModelConfig:
     """Convert the model using adapters defined in
     [vllm.model_executor.models.adapters][]. The most common use case is to
     adapt a text generation model to be used for pooling tasks."""
-    task: Optional[TaskOption] = None
+    task: TaskOption | None = None
     """[DEPRECATED] The task to use the model for. If the model supports more
     than one model runner, this is used to select which model runner to run.
 
@@ -139,7 +131,7 @@ class ModelConfig:
     trust_remote_code: bool = False
     """Trust remote code (e.g., from HuggingFace) when downloading the model
     and tokenizer."""
-    dtype: Union[ModelDType, torch.dtype] = "auto"
+    dtype: ModelDType | torch.dtype = "auto"
     """Data type for model weights and activations:\n
     - "auto" will use FP16 precision for FP32 and FP16 models, and BF16
     precision for BF16 models.\n
@@ -148,33 +140,33 @@ class ModelConfig:
     - "bfloat16" for a balance between precision and range.\n
     - "float" is shorthand for FP32 precision.\n
     - "float32" for FP32 precision."""
-    seed: Optional[int] = None
+    seed: int | None = None
     """Random seed for reproducibility. Initialized to None in V0, but
     initialized to 0 in V1."""
-    hf_config_path: Optional[str] = None
+    hf_config_path: str | None = None
     """Name or path of the Hugging Face config to use. If unspecified, model
     name or path will be used."""
     allowed_local_media_path: str = ""
     """Allowing API requests to read local images or videos from directories
     specified by the server file system. This is a security risk. Should only
     be enabled in trusted environments."""
-    allowed_media_domains: Optional[list[str]] = None
+    allowed_media_domains: list[str] | None = None
     """If set, only media URLs that belong to this domain can be used for 
     multi-modal inputs. """
-    revision: Optional[str] = None
+    revision: str | None = None
     """The specific model version to use. It can be a branch name, a tag name,
     or a commit id. If unspecified, will use the default version."""
-    code_revision: Optional[str] = None
+    code_revision: str | None = None
     """The specific revision to use for the model code on the Hugging Face Hub.
     It can be a branch name, a tag name, or a commit id. If unspecified, will
     use the default version."""
     rope_scaling: dict[str, Any] = field(default_factory=dict)
     """RoPE scaling configuration. For example,
     `{"rope_type":"dynamic","factor":2.0}`."""
-    rope_theta: Optional[float] = None
+    rope_theta: float | None = None
     """RoPE theta. Use with `rope_scaling`. In some cases, changing the RoPE
     theta improves the performance of the scaled model."""
-    tokenizer_revision: Optional[str] = None
+    tokenizer_revision: str | None = None
     """The specific revision to use for the tokenizer on the Hugging Face Hub.
     It can be a branch name, a tag name, or a commit id. If unspecified, will
     use the default version."""
@@ -187,9 +179,9 @@ class ModelConfig:
     - 1k -> 1000\n
     - 1K -> 1024\n
     - 25.6k -> 25,600"""
-    spec_target_max_model_len: Optional[int] = None
+    spec_target_max_model_len: int | None = None
     """Specify the maximum length for spec decoding draft models."""
-    quantization: SkipValidation[Optional[QuantizationMethods]] = None
+    quantization: SkipValidation[QuantizationMethods | None] = None
     """Method used to quantize the weights. If `None`, we first check the
     `quantization_config` attribute in the model config file. If that is
     `None`, we assume the model weights are not quantized and use `dtype` to
@@ -230,7 +222,7 @@ class ModelConfig:
     """If `True`, enables passing text embeddings as inputs via the
     `prompt_embeds` key. Note that enabling this will double the time required
     for graph compilation."""
-    served_model_name: Optional[Union[str, list[str]]] = None
+    served_model_name: str | list[str] | None = None
     """The model name(s) used in the API. If multiple names are provided, the
     server will respond to any of the provided names. The model name in the
     model field of a response will be the first name in this list. If not
@@ -238,20 +230,20 @@ class ModelConfig:
     that this name(s) will also be used in `model_name` tag content of
     prometheus metrics, if multiple names provided, metrics tag will take the
     first one."""
-    config_format: Union[str, ConfigFormat] = "auto"
+    config_format: str | ConfigFormat = "auto"
     """The format of the model config to load:\n
     - "auto" will try to load the config in hf format if available else it
     will try to load in mistral format.\n
     - "hf" will load the config in hf format.\n
     - "mistral" will load the config in mistral format."""
-    hf_token: Optional[Union[bool, str]] = None
+    hf_token: bool | str | None = None
     """The token to use as HTTP bearer authorization for remote files . If
     `True`, will use the token generated when running `huggingface-cli login`
     (stored in `~/.huggingface`)."""
     hf_overrides: HfOverrides = field(default_factory=dict)
     """If a dictionary, contains arguments to be forwarded to the Hugging Face
     config. If a callable, it is called to update the HuggingFace config."""
-    logits_processor_pattern: Optional[str] = None
+    logits_processor_pattern: str | None = None
     """Optional regex pattern specifying valid logits processor qualified names
     that can be passed with the `logits_processors` extra completion argument.
     Defaults to `None`, which allows no processors."""
@@ -269,7 +261,7 @@ class ModelConfig:
     `--generation-config vllm`, only the override parameters are used."""
     enable_sleep_mode: bool = False
     """Enable sleep mode for the engine (only cuda platform is supported)."""
-    model_impl: Union[str, ModelImpl] = "auto"
+    model_impl: str | ModelImpl = "auto"
     """Which implementation of the model to use:\n
     - "auto" will try to use the vLLM implementation, if it exists, and fall
     back to the Transformers implementation if no vLLM implementation is
@@ -278,36 +270,36 @@ class ModelConfig:
     - "transformers" will use the Transformers model implementation.\n
     - "terratorch" will use the TerraTorch model implementation.
     """
-    override_attention_dtype: Optional[str] = None
+    override_attention_dtype: str | None = None
     """Override dtype for attention"""
-    logits_processors: Optional[list[Union[str, type[LogitsProcessor]]]] = None
+    logits_processors: list[str | type[LogitsProcessor]] | None = None
     """One or more logits processors' fully-qualified class names or class
     definitions"""
-    io_processor_plugin: Optional[str] = None
+    io_processor_plugin: str | None = None
     """IOProcessor plugin name to load at model startup"""
 
     # Pooler config
-    pooler_config: Optional[PoolerConfig] = None
+    pooler_config: PoolerConfig | None = None
     """Pooler config which controls the behaviour of output pooling in pooling
     models."""
-    override_pooler_config: Optional[Union[dict, PoolerConfig]] = None
+    override_pooler_config: dict | PoolerConfig | None = None
     """[DEPRECATED] Use `pooler_config` instead. This field will be removed in
     v0.12.0 or v1.0.0, whichever is sooner."""
 
     # Multimodal config and init vars
-    multimodal_config: Optional[MultiModalConfig] = None
+    multimodal_config: MultiModalConfig | None = None
     """Configuration for multimodal model. If `None`, this will be inferred
     from the architecture of `self.model`."""
-    limit_mm_per_prompt: InitVar[Optional[dict[str, Union[int, dict[str, int]]]]] = None
-    media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None
-    mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None
-    mm_processor_cache_gb: InitVar[Optional[float]] = None
-    mm_processor_cache_type: InitVar[Optional[MMCacheType]] = None
-    mm_shm_cache_max_object_size_mb: InitVar[Optional[int]] = None
-    mm_encoder_tp_mode: InitVar[Optional[MMEncoderTPMode]] = None
-    interleave_mm_strings: InitVar[Optional[bool]] = None
-    skip_mm_profiling: InitVar[Optional[bool]] = None
-    video_pruning_rate: InitVar[Optional[float]] = None
+    limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None
+    media_io_kwargs: InitVar[dict[str, dict[str, Any]] | None] = None
+    mm_processor_kwargs: InitVar[dict[str, Any] | None] = None
+    mm_processor_cache_gb: InitVar[float | None] = None
+    mm_processor_cache_type: InitVar[MMCacheType | None] = None
+    mm_shm_cache_max_object_size_mb: InitVar[int | None] = None
+    mm_encoder_tp_mode: InitVar[MMEncoderTPMode | None] = None
+    interleave_mm_strings: InitVar[bool | None] = None
+    skip_mm_profiling: InitVar[bool | None] = None
+    video_pruning_rate: InitVar[float | None] = None
 
     def compute_hash(self) -> str:
         """
@@ -369,7 +361,7 @@ class ModelConfig:
 
     def _update_nested(
         self,
-        target: Union["PretrainedConfig", dict[str, Any]],
+        target: PretrainedConfig | dict[str, Any],
         updates: dict[str, Any],
     ) -> None:
         """Recursively updates a config or dict with nested updates."""
@@ -397,7 +389,7 @@ class ModelConfig:
 
     def _apply_dict_overrides(
         self,
-        config: "PretrainedConfig",
+        config: PretrainedConfig,
         overrides: dict[str, Any],
     ) -> None:
         """Apply dict overrides, handling both nested configs and dict values."""
@@ -415,16 +407,16 @@ class ModelConfig:
     def __post_init__(
         self,
         # Multimodal config init vars
-        limit_mm_per_prompt: Optional[dict[str, int]],
-        media_io_kwargs: Optional[dict[str, dict[str, Any]]],
-        mm_processor_kwargs: Optional[dict[str, Any]],
-        mm_processor_cache_gb: Optional[float],
-        mm_processor_cache_type: Optional[MMCacheType],
-        mm_shm_cache_max_object_size_mb: Optional[int],
-        mm_encoder_tp_mode: Optional[MMEncoderTPMode],
-        interleave_mm_strings: Optional[bool],
-        skip_mm_profiling: Optional[bool],
-        video_pruning_rate: Optional[float],
+        limit_mm_per_prompt: dict[str, int] | None,
+        media_io_kwargs: dict[str, dict[str, Any]] | None,
+        mm_processor_kwargs: dict[str, Any] | None,
+        mm_processor_cache_gb: float | None,
+        mm_processor_cache_type: MMCacheType | None,
+        mm_shm_cache_max_object_size_mb: int | None,
+        mm_encoder_tp_mode: MMEncoderTPMode | None,
+        interleave_mm_strings: bool | None,
+        skip_mm_profiling: bool | None,
+        video_pruning_rate: float | None,
     ) -> None:
         # Set the default seed to 0 in V1.
         # NOTE(woosuk): In V0, we set the default seed to None because the
@@ -1209,7 +1201,7 @@ class ModelConfig:
                 "Supported models implement the `SupportsPP` interface."
             )
 
-    def get_sliding_window(self) -> Optional[int]:
+    def get_sliding_window(self) -> int | None:
         """Get the sliding window size from the HF text config if present."""
         return getattr(self.hf_text_config, "sliding_window", None)
 
@@ -1479,7 +1471,7 @@ class ModelConfig:
                     f"{block_type.value} layers"
                 )
 
-    def get_mamba_chunk_size(self) -> Optional[int]:
+    def get_mamba_chunk_size(self) -> int | None:
         """
         Returns the mamba chunk size if it exists
         """
@@ -1715,9 +1707,7 @@ class ModelConfig:
         return max_model_len
 
 
-def get_served_model_name(
-    model: str, served_model_name: Optional[Union[str, list[str]]]
-):
+def get_served_model_name(model: str, served_model_name: str | list[str] | None):
     """
     If the input is a non-empty list, the first model_name in
     `served_model_name` is taken.
@@ -1761,9 +1751,9 @@ def iter_architecture_defaults():
 def try_match_architecture_defaults(
     architecture: str,
     *,
-    runner_type: Optional[RunnerType] = None,
-    convert_type: Optional[ConvertType] = None,
-) -> Optional[tuple[str, tuple[RunnerType, ConvertType]]]:
+    runner_type: RunnerType | None = None,
+    convert_type: ConvertType | None = None,
+) -> tuple[str, tuple[RunnerType, ConvertType]] | None:
     for suffix, (
         default_runner_type,
         default_convert_type,
@@ -1817,7 +1807,7 @@ def _find_dtype(
     model_id: str,
     config: PretrainedConfig,
     *,
-    revision: Optional[str],
+    revision: str | None,
 ):
     # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
     # because config.torch_dtype can be None.
@@ -1902,10 +1892,10 @@ def _resolve_auto_dtype(
 def _get_and_verify_dtype(
     model_id: str,
     config: PretrainedConfig,
-    dtype: Union[str, torch.dtype],
+    dtype: str | torch.dtype,
     *,
     is_pooling_model: bool,
-    revision: Optional[str] = None,
+    revision: str | None = None,
 ) -> torch.dtype:
     config_dtype = _find_dtype(model_id, config, revision=revision)
     model_type = config.model_type
@@ -1947,7 +1937,7 @@ def _get_and_verify_dtype(
 def _get_head_dtype(
     config: PretrainedConfig, dtype: torch.dtype, runner_type: str
 ) -> torch.dtype:
-    head_dtype: Optional[Union[str, torch.dtype]] = getattr(config, "head_dtype", None)
+    head_dtype: str | torch.dtype | None = getattr(config, "head_dtype", None)
 
     if head_dtype == "model":
         return dtype
@@ -1970,12 +1960,12 @@ def _get_head_dtype(
 
 def _get_and_verify_max_len(
     hf_config: PretrainedConfig,
-    tokenizer_config: Optional[dict],
-    max_model_len: Optional[int],
+    tokenizer_config: dict | None,
+    max_model_len: int | None,
     disable_sliding_window: bool,
-    sliding_window: Optional[int],
-    spec_target_max_model_len: Optional[int] = None,
-    encoder_config: Optional[Any] = None,
+    sliding_window: int | None,
+    spec_target_max_model_len: int | None = None,
+    encoder_config: Any | None = None,
 ) -> int:
     """Get and verify the model's maximum length."""
     derived_max_model_len = float("inf")
diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py
index fc8d2262dcb40..5c253807915a4 100644
--- a/vllm/config/multimodal.py
+++ b/vllm/config/multimodal.py
@@ -4,7 +4,7 @@
 import hashlib
 from collections.abc import Mapping
 from dataclasses import field
-from typing import Any, Literal, Optional, Union
+from typing import Any, Literal, TypeAlias
 
 from pydantic import ConfigDict, Field, field_validator
 from pydantic.dataclasses import dataclass
@@ -23,31 +23,31 @@ class BaseDummyOptions:
 class VideoDummyOptions(BaseDummyOptions):
     """Options for generating dummy video data during profiling."""
 
-    num_frames: Optional[int] = Field(None, gt=0)
-    width: Optional[int] = Field(None, gt=0)
-    height: Optional[int] = Field(None, gt=0)
+    num_frames: int | None = Field(None, gt=0)
+    width: int | None = Field(None, gt=0)
+    height: int | None = Field(None, gt=0)
 
 
 @dataclass(config=ConfigDict(extra="forbid"))
 class ImageDummyOptions(BaseDummyOptions):
     """Options for generating dummy image data during profiling."""
 
-    width: Optional[int] = Field(None, gt=0)
-    height: Optional[int] = Field(None, gt=0)
+    width: int | None = Field(None, gt=0)
+    height: int | None = Field(None, gt=0)
 
 
 @dataclass(config=ConfigDict(extra="forbid"))
 class AudioDummyOptions(BaseDummyOptions):
     """Options for generating dummy audio data during profiling."""
 
-    length: Optional[int] = Field(None, gt=0)
+    length: int | None = Field(None, gt=0)
 
 
 MMEncoderTPMode = Literal["weights", "data"]
 MMCacheType = Literal["shm", "lru"]
-DummyOptions = Union[
-    BaseDummyOptions, VideoDummyOptions, ImageDummyOptions, AudioDummyOptions
-]
+DummyOptions: TypeAlias = (
+    BaseDummyOptions | VideoDummyOptions | ImageDummyOptions | AudioDummyOptions
+)
 
 
 @config
@@ -75,7 +75,7 @@ class MultiModalConfig:
     """Additional args passed to process media inputs, keyed by modalities.
     For example, to set num_frames for video, set
     `--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
-    mm_processor_kwargs: Optional[dict[str, object]] = None
+    mm_processor_kwargs: dict[str, object] | None = None
     """Arguments to be forwarded to the model's processor for multi-modal data,
     e.g., image processor. Overrides for the multi-modal processor obtained
     from `transformers.AutoProcessor.from_pretrained`.
@@ -123,7 +123,7 @@ class MultiModalConfig:
     This reduces engine startup time but shifts the responsibility to users for
     estimating the peak memory usage of the activation of multimodal encoder and
     embedding cache."""
-    video_pruning_rate: Optional[float] = None
+    video_pruning_rate: float | None = None
     """Sets pruning rate for video pruning via Efficient Video Sampling.
     Value sits in range [0;1) and determines fraction of media tokens
     from each video to be pruned.
@@ -132,7 +132,7 @@ class MultiModalConfig:
     @field_validator("limit_per_prompt", mode="before")
     @classmethod
     def _validate_limit_per_prompt(
-        cls, value: dict[str, Union[int, dict[str, int]]]
+        cls, value: dict[str, int | dict[str, int]]
     ) -> dict[str, DummyOptions]:
         for k, v in value.items():
             # Handle legacy format where only count is specified
@@ -179,7 +179,7 @@ class MultiModalConfig:
             return 999
         return limit_data.count
 
-    def get_dummy_options(self, modality: str) -> Optional[BaseDummyOptions]:
+    def get_dummy_options(self, modality: str) -> BaseDummyOptions | None:
         """
         Get the configurable dummy data options for a modality.
         Returns None if no options are configured for this modality.
diff --git a/vllm/config/observability.py b/vllm/config/observability.py
index 6c7b5fbbee477..592246c1c35f8 100644
--- a/vllm/config/observability.py
+++ b/vllm/config/observability.py
@@ -3,7 +3,7 @@
 
 import hashlib
 from functools import cached_property
-from typing import Any, Literal, Optional, cast
+from typing import Any, Literal, cast
 
 from pydantic.dataclasses import dataclass
 
@@ -18,7 +18,7 @@ DetailedTraceModules = Literal["model", "worker", "all"]
 class ObservabilityConfig:
     """Configuration for observability - metrics and tracing."""
 
-    show_hidden_metrics_for_version: Optional[str] = None
+    show_hidden_metrics_for_version: str | None = None
     """Enable deprecated Prometheus metrics that have been hidden since the
     specified version. For example, if a previously deprecated metric has been
     hidden since the v0.7.0 release, you use
@@ -33,10 +33,10 @@ class ObservabilityConfig:
             return False
         return version._prev_minor_version_was(self.show_hidden_metrics_for_version)
 
-    otlp_traces_endpoint: Optional[str] = None
+    otlp_traces_endpoint: str | None = None
     """Target URL to which OpenTelemetry traces will be sent."""
 
-    collect_detailed_traces: Optional[list[DetailedTraceModules]] = None
+    collect_detailed_traces: list[DetailedTraceModules] | None = None
     """It makes sense to set this only if `--otlp-traces-endpoint` is set. If
     set, it will collect detailed traces for the specified modules. This
     involves use of possibly costly and or blocking operations and hence might
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index 88bee9e2d42ee..084e458f88309 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -3,7 +3,7 @@
 
 import hashlib
 import os
-from typing import TYPE_CHECKING, Any, Literal, Optional, Union
+from typing import TYPE_CHECKING, Any, Literal
 
 import torch
 from pydantic import Field, model_validator
@@ -75,7 +75,7 @@ class ParallelConfig:
     """Number of local data parallel groups."""
     data_parallel_rank: int = 0
     """Rank of the data parallel group."""
-    data_parallel_rank_local: Optional[int] = None
+    data_parallel_rank_local: int | None = None
     """Local rank of the data parallel group,
     set only in SPMD mode."""
     data_parallel_master_ip: str = "127.0.0.1"
@@ -113,24 +113,24 @@ class ParallelConfig:
       with 4 experts and 2 ranks, rank 0 will have experts [0, 2] and rank 1
       will have experts [1, 3]. This strategy can help improve load balancing
       for grouped expert models with no redundant experts."""
-    num_redundant_experts: Optional[int] = None
+    num_redundant_experts: int | None = None
     """`num_redundant_experts` is deprecated and has been replaced with
     `eplb_config.num_redundant_experts`. This will be removed in v0.12.0.
     Please use `eplb_config.num_redundant_experts` instead."""
-    eplb_window_size: Optional[int] = None
+    eplb_window_size: int | None = None
     """`eplb_window_size` is deprecated and has been replaced with
     `eplb_config.window_size`. This will be removed in v0.12.0.
     Please use `eplb_config.window_size` instead."""
-    eplb_step_interval: Optional[int] = None
+    eplb_step_interval: int | None = None
     """`eplb_step_interval` is deprecated and has been replaced with
     `eplb_config.step_interval`. This will be removed in v0.12.0.
     Please use `eplb_config.step_interval` instead."""
-    eplb_log_balancedness: Optional[bool] = None
+    eplb_log_balancedness: bool | None = None
     """`eplb_log_balancedness` is deprecated and has been replaced with
     `eplb_config.log_balancedness`. This will be removed in v0.12.0.
     Please use `eplb_config.log_balancedness` instead."""
 
-    max_parallel_loading_workers: Optional[int] = None
+    max_parallel_loading_workers: int | None = None
     """Maximum number of parallel loading workers when loading model
     sequentially in multiple batches. To avoid RAM OOM when using tensor
     parallel and large models."""
@@ -159,15 +159,15 @@ class ParallelConfig:
     ray_workers_use_nsight: bool = False
     """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""
 
-    ray_runtime_env: Optional[RuntimeEnv] = None
+    ray_runtime_env: RuntimeEnv | None = None
     """Ray runtime environment to pass to distributed workers."""
 
-    placement_group: Optional[PlacementGroup] = None
+    placement_group: PlacementGroup | None = None
     """ray distributed model workers placement group."""
 
-    distributed_executor_backend: Optional[
-        Union[str, DistributedExecutorBackend, type[ExecutorBase]]
-    ] = None
+    distributed_executor_backend: (
+        str | DistributedExecutorBackend | type[ExecutorBase] | None
+    ) = None
     """Backend to use for distributed model
     workers, either "ray" or "mp" (multiprocessing). If the product
     of pipeline_parallel_size and tensor_parallel_size is less than
@@ -306,7 +306,7 @@ class ParallelConfig:
         )
 
         max_retries = 5
-        last_exc: Optional[Exception] = None
+        last_exc: Exception | None = None
         for _ in range(max_retries):
             try:
                 # use gloo since the engine process might not have cuda device
diff --git a/vllm/config/pooler.py b/vllm/config/pooler.py
index 8b10992faa022..e40fc6a9bb20c 100644
--- a/vllm/config/pooler.py
+++ b/vllm/config/pooler.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import hashlib
-from typing import Any, Optional
+from typing import Any
 
 from pydantic.dataclasses import dataclass
 
@@ -14,23 +14,23 @@ from vllm.config.utils import config
 class PoolerConfig:
     """Controls the behavior of output pooling in pooling models."""
 
-    pooling_type: Optional[str] = None
+    pooling_type: str | None = None
     """
     The pooling method of the pooling model. This should be a key in
     [`vllm.model_executor.layers.pooler.PoolingType`][].
     """
 
     ## for embeddings models
-    normalize: Optional[bool] = None
+    normalize: bool | None = None
     """
     Whether to normalize the embeddings outputs. Defaults to True.
     """
-    dimensions: Optional[int] = None
+    dimensions: int | None = None
     """
     Reduce the dimensions of embeddings if model
     support matryoshka representation. Defaults to None.
     """
-    enable_chunked_processing: Optional[bool] = None
+    enable_chunked_processing: bool | None = None
     """
     Whether to enable chunked processing for long inputs that exceed the model's
     maximum position embeddings. When enabled, long inputs will be split into
@@ -38,7 +38,7 @@ class PoolerConfig:
     This allows embedding models to handle arbitrarily long text without CUDA
     errors. Defaults to False.
     """
-    max_embed_len: Optional[int] = None
+    max_embed_len: int | None = None
     """
     Maximum input length allowed for embedding generation. When set, allows
     inputs longer than max_embed_len to be accepted for embedding models.
@@ -48,29 +48,29 @@ class PoolerConfig:
     """
 
     ## for classification models
-    activation: Optional[bool] = None
+    activation: bool | None = None
     """
     Whether to apply activation function to the classification outputs.
     Defaults to True.
     """
-    logit_bias: Optional[float] = None
+    logit_bias: float | None = None
     """
     If provided, apply classification logit biases. Defaults to None.
     """
 
     ## for reward models
-    softmax: Optional[bool] = None
+    softmax: bool | None = None
     """
     Whether to apply softmax to the reward outputs.
     Defaults to True.
     """
-    step_tag_id: Optional[int] = None
+    step_tag_id: int | None = None
     """
     If set, only the score corresponding to the ``step_tag_id`` in the
     generated sentence should be returned. Otherwise, the scores for all tokens
     are returned.
     """
-    returned_token_ids: Optional[list[int]] = None
+    returned_token_ids: list[int] | None = None
     """
     A list of indices for the vocabulary dimensions to be extracted,
     such as the token IDs of ``good_token`` and ``bad_token`` in the
diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
index 396258aac287b..061aa4d4a4f5b 100644
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -3,7 +3,7 @@
 
 import hashlib
 from dataclasses import InitVar, field
-from typing import Any, Literal, Union
+from typing import Any, Literal
 
 from pydantic import SkipValidation, model_validator
 from pydantic.dataclasses import dataclass
@@ -133,7 +133,7 @@ class SchedulerConfig:
 
     # scheduler class or path. "vllm.core.scheduler.Scheduler" (default)
     # or "mod.custom_class".
-    scheduler_cls: Union[str, type[object]] = "vllm.core.scheduler.Scheduler"
+    scheduler_cls: str | type[object] = "vllm.core.scheduler.Scheduler"
     """The scheduler class to use. "vllm.core.scheduler.Scheduler" is the
     default scheduler. Can be a class directly or the path to a class of form
     "mod.custom_class"."""
diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py
index aa0c07cf62a36..aa254a9b35f65 100644
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -3,7 +3,7 @@
 
 import ast
 import hashlib
-from typing import TYPE_CHECKING, Any, Literal, Optional
+from typing import TYPE_CHECKING, Any, Literal
 
 from pydantic import SkipValidation, model_validator
 from pydantic.dataclasses import dataclass
@@ -59,16 +59,16 @@ MTP_MODEL_TYPES = (
 class SpeculativeConfig:
     """Configuration for speculative decoding."""
 
-    enforce_eager: Optional[bool] = None
+    enforce_eager: bool | None = None
     """Override the default enforce_eager from model_config"""
     # General speculative decoding control
     num_speculative_tokens: SkipValidation[int] = None  # type: ignore
     """The number of speculative tokens, if provided. It will default to the
     number in the draft model config if present, otherwise, it is required."""
-    model: Optional[str] = None
+    model: str | None = None
     """The name of the draft model, eagle head, or additional weights, if
     provided."""
-    method: Optional[SpeculativeMethod] = None
+    method: SpeculativeMethod | None = None
     """The name of the speculative method to use. If users provide and set the
     `model` param, the speculative method type will be detected automatically
     if possible, if `model` param is not provided, the method name must be
@@ -76,7 +76,7 @@ class SpeculativeConfig:
 
     If using `ngram` method, the related configuration `prompt_lookup_max` and
     `prompt_lookup_min` should be considered."""
-    draft_tensor_parallel_size: Optional[int] = None
+    draft_tensor_parallel_size: int | None = None
     """The degree of the tensor parallelism for the draft model. Can only be 1
     or the same as the target model's tensor parallel size."""
     disable_logprobs: bool = True
@@ -85,24 +85,24 @@ class SpeculativeConfig:
     according to the log probability settings in SamplingParams."""
 
     # Draft model configuration
-    quantization: Optional[me_quant.QuantizationMethods] = None
+    quantization: me_quant.QuantizationMethods | None = None
     """Quantization method that was used to quantize the draft model weights.
     If `None`, we assume the model weights are not quantized. Note that it only
     takes effect when using the draft model-based speculative method."""
-    max_model_len: Optional[int] = None
+    max_model_len: int | None = None
     """The maximum model length of the draft model. Used when testing the
     ability to skip speculation for some sequences."""
-    revision: Optional[str] = None
+    revision: str | None = None
     """The specific model version to use for the draft model. It can be a
     branch name, a tag name, or a commit id. If unspecified, will use the
     default version."""
-    code_revision: Optional[str] = None
+    code_revision: str | None = None
     """The specific revision to use for the draft model code on Hugging Face
     Hub. It can be a branch name, a tag name, or a commit id. If unspecified,
     will use the default version."""
 
     # Advanced control
-    disable_by_batch_size: Optional[int] = None
+    disable_by_batch_size: int | None = None
     """Disable speculative decoding for new incoming requests when the number
     of enqueued requests is larger than this value, if provided."""
     disable_padded_drafter_batch: bool = False
@@ -112,14 +112,14 @@ class SpeculativeConfig:
     only affects the EAGLE method of speculation."""
 
     # Ngram proposer configuration
-    prompt_lookup_max: Optional[int] = None
+    prompt_lookup_max: int | None = None
     """Maximum size of ngram token window when using Ngram proposer, required
     when method is set to ngram."""
-    prompt_lookup_min: Optional[int] = None
+    prompt_lookup_min: int | None = None
     """Minimum size of ngram token window when using Ngram proposer, if
     provided. Defaults to 1."""
 
-    speculative_token_tree: Optional[str] = None
+    speculative_token_tree: str | None = None
     """Specifies the tree structure for speculative token generation.
     """
     # required configuration params passed from engine
@@ -449,7 +449,7 @@ class SpeculativeConfig:
 
     @staticmethod
     def _maybe_override_draft_max_model_len(
-        speculative_max_model_len: Optional[int],
+        speculative_max_model_len: int | None,
         draft_max_model_len: int,
         target_max_model_len: int,
     ) -> int:
@@ -488,7 +488,7 @@ class SpeculativeConfig:
     @staticmethod
     def _verify_and_get_draft_tp(
         target_parallel_config: ParallelConfig,
-        speculative_draft_tensor_parallel_size: Optional[int],
+        speculative_draft_tensor_parallel_size: int | None,
         draft_hf_config: PretrainedConfig,
     ) -> int:
         """
diff --git a/vllm/config/speech_to_text.py b/vllm/config/speech_to_text.py
index de9f525efe185..3eafff1a30609 100644
--- a/vllm/config/speech_to_text.py
+++ b/vllm/config/speech_to_text.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 from pydantic.dataclasses import dataclass
 
@@ -28,7 +27,7 @@ class SpeechToTextConfig:
     splitting long audio. This helps maintain context across chunk boundaries
     and improves transcription quality at split points."""
 
-    min_energy_split_window_size: Optional[int] = 1600
+    min_energy_split_window_size: int | None = 1600
     """Window size in samples for finding low-energy (quiet) regions to split
     audio chunks. The algorithm looks for the quietest moment within this
     window to minimize cutting through speech. Default 1600 samples ≈ 100ms
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 9d156dd8d9de3..b15d122c9161a 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -10,7 +10,7 @@ from contextlib import contextmanager
 from dataclasses import field, replace
 from functools import lru_cache
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
+from typing import TYPE_CHECKING, Any, TypeVar
 
 import torch
 from pydantic import ConfigDict
@@ -69,17 +69,17 @@ class VllmConfig:
     """Device configuration."""
     load_config: LoadConfig = field(default_factory=LoadConfig)
     """Load configuration."""
-    lora_config: Optional[LoRAConfig] = None
+    lora_config: LoRAConfig | None = None
     """LoRA configuration."""
-    speculative_config: Optional[SpeculativeConfig] = None
+    speculative_config: SpeculativeConfig | None = None
     """Speculative decoding configuration."""
     structured_outputs_config: StructuredOutputsConfig = field(
         default_factory=StructuredOutputsConfig
     )
     """Structured outputs configuration."""
-    observability_config: Optional[ObservabilityConfig] = None
+    observability_config: ObservabilityConfig | None = None
     """Observability configuration."""
-    quant_config: Optional[QuantizationConfig] = None
+    quant_config: QuantizationConfig | None = None
     """Quantization configuration."""
     compilation_config: CompilationConfig = field(default_factory=CompilationConfig)
     """`torch.compile` and cudagraph capture configuration for the model.
@@ -96,14 +96,14 @@ class VllmConfig:
     You can specify the full compilation config like so:
     `{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
     """
-    kv_transfer_config: Optional[KVTransferConfig] = None
+    kv_transfer_config: KVTransferConfig | None = None
     """The configurations for distributed KV cache transfer."""
-    kv_events_config: Optional[KVEventsConfig] = None
+    kv_events_config: KVEventsConfig | None = None
     """The configurations for event publishing."""
     # some opaque config, only used to provide additional information
     # for the hash computation, mainly used for testing, debugging or out of
     # tree config registration.
-    additional_config: Union[dict, SupportsHash] = field(default_factory=dict)
+    additional_config: dict | SupportsHash = field(default_factory=dict)
     """Additional config for specified platform. Different platforms may
     support different configs. Make sure the configs are valid for the platform
     you are using. Contents must be hashable."""
@@ -212,7 +212,7 @@ class VllmConfig:
     @staticmethod
     def _get_quantization_config(
         model_config: ModelConfig, load_config: LoadConfig
-    ) -> Optional[QuantizationConfig]:
+    ) -> QuantizationConfig | None:
         """Get the quantization config."""
         from vllm.platforms import current_platform
 
@@ -245,7 +245,7 @@ class VllmConfig:
     @staticmethod
     def get_quantization_config(
         model_config: ModelConfig, load_config: LoadConfig
-    ) -> Optional[QuantizationConfig]:
+    ) -> QuantizationConfig | None:
         import copy
 
         # For some reason, the _ version of this modifies the model_config
@@ -257,7 +257,7 @@ class VllmConfig:
     def with_hf_config(
         self,
         hf_config: PretrainedConfig,
-        architectures: Optional[list[str]] = None,
+        architectures: list[str] | None = None,
     ) -> "VllmConfig":
         if architectures is not None:
             hf_config = copy.deepcopy(hf_config)
@@ -740,7 +740,7 @@ class VllmConfig:
                     f"Model: {self.model_config.model}"
                 )
 
-    def compile_debug_dump_path(self) -> Optional[Path]:
+    def compile_debug_dump_path(self) -> Path | None:
         """Returns a rank-aware path for dumping
         torch.compile debug information.
         """
@@ -790,13 +790,13 @@ class VllmConfig:
         )
 
 
-_current_vllm_config: Optional[VllmConfig] = None
-_current_prefix: Optional[str] = None
+_current_vllm_config: VllmConfig | None = None
+_current_prefix: str | None = None
 
 
 @contextmanager
 def set_current_vllm_config(
-    vllm_config: VllmConfig, check_compile=False, prefix: Optional[str] = None
+    vllm_config: VllmConfig, check_compile=False, prefix: str | None = None
 ):
     """
     Temporarily set the current vLLM config.
@@ -866,7 +866,7 @@ T = TypeVar("T")
 def get_layers_from_vllm_config(
     vllm_config: VllmConfig,
     layer_type: type[T],
-    layer_names: Optional[list[str]] = None,
+    layer_names: list[str] | None = None,
 ) -> dict[str, T]:
     """
     Get layers from the vLLM config.
diff --git a/vllm/connections.py b/vllm/connections.py
index 8d5e0e5cbf5d0..31b0d5e9c702f 100644
--- a/vllm/connections.py
+++ b/vllm/connections.py
@@ -3,7 +3,6 @@
 
 from collections.abc import Mapping, MutableMapping
 from pathlib import Path
-from typing import Optional
 from urllib.parse import urlparse
 
 import aiohttp
@@ -20,8 +19,8 @@ class HTTPConnection:
 
         self.reuse_client = reuse_client
 
-        self._sync_client: Optional[requests.Session] = None
-        self._async_client: Optional[aiohttp.ClientSession] = None
+        self._sync_client: requests.Session | None = None
+        self._async_client: aiohttp.ClientSession | None = None
 
     def get_sync_client(self) -> requests.Session:
         if self._sync_client is None or not self.reuse_client:
@@ -53,8 +52,8 @@ class HTTPConnection:
         url: str,
         *,
         stream: bool = False,
-        timeout: Optional[float] = None,
-        extra_headers: Optional[Mapping[str, str]] = None,
+        timeout: float | None = None,
+        extra_headers: Mapping[str, str] | None = None,
         allow_redirects: bool = True,
     ):
         self._validate_http_url(url)
@@ -74,8 +73,8 @@ class HTTPConnection:
         self,
         url: str,
         *,
-        timeout: Optional[float] = None,
-        extra_headers: Optional[Mapping[str, str]] = None,
+        timeout: float | None = None,
+        extra_headers: Mapping[str, str] | None = None,
         allow_redirects: bool = True,
     ):
         self._validate_http_url(url)
@@ -91,7 +90,7 @@ class HTTPConnection:
         )
 
     def get_bytes(
-        self, url: str, *, timeout: Optional[float] = None, allow_redirects: bool = True
+        self, url: str, *, timeout: float | None = None, allow_redirects: bool = True
     ) -> bytes:
         with self.get_response(
             url, timeout=timeout, allow_redirects=allow_redirects
@@ -104,7 +103,7 @@ class HTTPConnection:
         self,
         url: str,
         *,
-        timeout: Optional[float] = None,
+        timeout: float | None = None,
         allow_redirects: bool = True,
     ) -> bytes:
         async with await self.get_async_response(
@@ -114,7 +113,7 @@ class HTTPConnection:
 
             return await r.read()
 
-    def get_text(self, url: str, *, timeout: Optional[float] = None) -> str:
+    def get_text(self, url: str, *, timeout: float | None = None) -> str:
         with self.get_response(url, timeout=timeout) as r:
             r.raise_for_status()
 
@@ -124,14 +123,14 @@ class HTTPConnection:
         self,
         url: str,
         *,
-        timeout: Optional[float] = None,
+        timeout: float | None = None,
     ) -> str:
         async with await self.get_async_response(url, timeout=timeout) as r:
             r.raise_for_status()
 
             return await r.text()
 
-    def get_json(self, url: str, *, timeout: Optional[float] = None) -> str:
+    def get_json(self, url: str, *, timeout: float | None = None) -> str:
         with self.get_response(url, timeout=timeout) as r:
             r.raise_for_status()
 
@@ -141,7 +140,7 @@ class HTTPConnection:
         self,
         url: str,
         *,
-        timeout: Optional[float] = None,
+        timeout: float | None = None,
     ) -> str:
         async with await self.get_async_response(url, timeout=timeout) as r:
             r.raise_for_status()
@@ -153,7 +152,7 @@ class HTTPConnection:
         url: str,
         save_path: Path,
         *,
-        timeout: Optional[float] = None,
+        timeout: float | None = None,
         chunk_size: int = 128,
     ) -> Path:
         with self.get_response(url, timeout=timeout) as r:
@@ -170,7 +169,7 @@ class HTTPConnection:
         url: str,
         save_path: Path,
         *,
-        timeout: Optional[float] = None,
+        timeout: float | None = None,
         chunk_size: int = 128,
     ) -> Path:
         async with await self.get_async_response(url, timeout=timeout) as r:
diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
index 97c6654385b35..2586927864ab9 100644
--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -11,8 +11,9 @@
 import dataclasses
 import gc
 import os
+from collections.abc import Callable
 from contextlib import contextmanager
-from typing import Any, Callable, Optional, Union
+from typing import Any
 
 import torch
 
@@ -22,7 +23,7 @@ from vllm.utils import is_pin_memory_available
 logger = init_logger(__name__)
 
 
-def find_loaded_library(lib_name) -> Optional[str]:
+def find_loaded_library(lib_name) -> str | None:
     """
     According to according to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
     the file `/proc/self/maps` contains the memory maps of the process, which includes the
@@ -78,7 +79,7 @@ HandleType = tuple[int, int, int, int]
 class AllocationData:
     handle: HandleType
     tag: str
-    cpu_backup_tensor: Optional[torch.Tensor] = None
+    cpu_backup_tensor: torch.Tensor | None = None
 
 
 def create_and_map(allocation_handle: HandleType) -> None:
@@ -197,7 +198,7 @@ class CuMemAllocator:
         )
         return data.handle
 
-    def sleep(self, offload_tags: Optional[Union[tuple[str, ...], str]] = None) -> None:
+    def sleep(self, offload_tags: tuple[str, ...] | str | None = None) -> None:
         """
         Put the allocator in sleep mode.
         All data in the memory allocation with the specified tag will be
@@ -247,7 +248,7 @@ class CuMemAllocator:
         gc.collect()
         torch.cuda.empty_cache()
 
-    def wake_up(self, tags: Optional[list[str]] = None) -> None:
+    def wake_up(self, tags: list[str] | None = None) -> None:
         """
         Wake up the allocator from sleep mode.
         All data that is previously offloaded will be loaded back to GPU
@@ -272,7 +273,7 @@ class CuMemAllocator:
                         data.cpu_backup_tensor = None
 
     @contextmanager
-    def use_memory_pool(self, tag: Optional[str] = None):
+    def use_memory_pool(self, tag: str | None = None):
         """
         A context manager to use the memory pool.
         All memory allocation created inside the context will be allocated
diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py
index 46a735f22ed85..5ad99e4e1592d 100644
--- a/vllm/distributed/communication_op.py
+++ b/vllm/distributed/communication_op.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 import torch.distributed
@@ -30,13 +30,13 @@ def tensor_model_parallel_reduce_scatter(
 
 def tensor_model_parallel_gather(
     input_: torch.Tensor, dst: int = 0, dim: int = -1
-) -> Optional[torch.Tensor]:
+) -> torch.Tensor | None:
     """Gather the input tensor across model parallel group."""
     return get_tp_group().gather(input_, dst, dim)
 
 
 def broadcast_tensor_dict(
-    tensor_dict: Optional[dict[Any, Union[torch.Tensor, Any]]] = None, src: int = 0
+    tensor_dict: dict[Any, torch.Tensor | Any] | None = None, src: int = 0
 ):
     if not torch.distributed.is_initialized():
         return tensor_dict
diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py
index a67405f44206a..48673202c6cce 100644
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any, Optional
+from typing import Any
 
 import torch
 import torch.distributed as dist
@@ -368,7 +368,7 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
         return handle
 
     # DeepEP LL uses RDMA so no SMs are used for communication
-    def max_sms_used(self) -> Optional[int]:
+    def max_sms_used(self) -> int | None:
         return 0
 
 
diff --git a/vllm/distributed/device_communicators/all_reduce_utils.py b/vllm/distributed/device_communicators/all_reduce_utils.py
index dabb48320be45..9e99fd01a9197 100644
--- a/vllm/distributed/device_communicators/all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/all_reduce_utils.py
@@ -10,7 +10,7 @@ import sys
 import tempfile
 from collections.abc import Sequence
 from itertools import product
-from typing import Any, Optional
+from typing import Any
 
 import torch
 import torch.distributed as dist
@@ -86,7 +86,7 @@ def producer(
     producer_queue,
     consumer_queue,
     result_queue,
-    cuda_visible_devices: Optional[str] = None,
+    cuda_visible_devices: str | None = None,
 ):
     if cuda_visible_devices is not None:
         update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
@@ -120,7 +120,7 @@ def consumer(
     producer_queue,
     consumer_queue,
     result_queue,
-    cuda_visible_devices: Optional[str] = None,
+    cuda_visible_devices: str | None = None,
 ):
     if cuda_visible_devices is not None:
         update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
@@ -253,7 +253,7 @@ def can_actually_p2p(
 #  e.g. used by different vllm engines. The device id in the cache file is a
 #  **local** device id, i.e. from 0 to num_dev-1, where num_dev is the number
 #  of visible devices in the vllm engine.
-_gpu_p2p_access_cache: Optional[dict[str, bool]] = None
+_gpu_p2p_access_cache: dict[str, bool] | None = None
 
 
 def gpu_p2p_access_check(src: int, tgt: int) -> bool:
diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py
index c32be0bec55c0..007c65acedb9b 100644
--- a/vllm/distributed/device_communicators/base_device_communicator.py
+++ b/vllm/distributed/device_communicators/base_device_communicator.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import threading
-from typing import Optional, Union
 from weakref import WeakValueDictionary
 
 import torch
@@ -75,7 +74,7 @@ class All2AllManagerBase:
     def set_num_sms(self, num_sms: int):
         pass
 
-    def max_sms_used(self) -> Optional[int]:
+    def max_sms_used(self) -> int | None:
         return None  # None means it could use the whole GPU
 
     def combine(self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False):
@@ -96,8 +95,8 @@ class DeviceCommunicatorBase:
     def __init__(
         self,
         cpu_group: ProcessGroup,
-        device: Optional[torch.device] = None,
-        device_group: Optional[ProcessGroup] = None,
+        device: torch.device | None = None,
+        device_group: ProcessGroup | None = None,
         unique_name: str = "",
     ):
         self.device = device or torch.device("cpu")
@@ -123,7 +122,7 @@ class DeviceCommunicatorBase:
 
         self.is_ep_communicator = "ep" in unique_name
         self.use_all2all = self.is_ep_communicator and use_ep
-        self.all2all_manager: Optional[All2AllManagerBase] = None
+        self.all2all_manager: All2AllManagerBase | None = None
 
     def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
         dist.all_reduce(input_, group=self.device_group)
@@ -156,10 +155,10 @@ class DeviceCommunicatorBase:
 
     def all_gatherv(
         self,
-        input_: Union[torch.Tensor, list[torch.Tensor]],
+        input_: torch.Tensor | list[torch.Tensor],
         dim: int = 0,
-        sizes: Optional[list[int]] = None,
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+        sizes: list[int] | None = None,
+    ) -> torch.Tensor | list[torch.Tensor]:
         raise NotImplementedError
 
     def reduce_scatter(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
@@ -196,13 +195,13 @@ class DeviceCommunicatorBase:
         return output_tensor.movedim(0, dim).contiguous()
 
     def reduce_scatterv(
-        self, input_: torch.Tensor, dim: int = -1, sizes: Optional[list[int]] = None
+        self, input_: torch.Tensor, dim: int = -1, sizes: list[int] | None = None
     ) -> torch.Tensor:
         raise NotImplementedError
 
     def gather(
         self, input_: torch.Tensor, dst: int = 0, dim: int = -1
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """
         NOTE: We assume that the input tensor is on the same device across
         all the ranks.
@@ -231,7 +230,7 @@ class DeviceCommunicatorBase:
             output_tensor = None
         return output_tensor
 
-    def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None:
+    def send(self, tensor: torch.Tensor, dst: int | None = None) -> None:
         """Sends a tensor to the destination rank in a blocking way"""
         """NOTE: `dst` is the local rank of the destination rank."""
         if dst is None:
@@ -239,7 +238,7 @@ class DeviceCommunicatorBase:
         torch.distributed.send(tensor, self.ranks[dst], self.device_group)
 
     def recv(
-        self, size: torch.Size, dtype: torch.dtype, src: Optional[int] = None
+        self, size: torch.Size, dtype: torch.dtype, src: int | None = None
     ) -> torch.Tensor:
         """Receives a tensor from the source rank."""
         """NOTE: `src` is the local rank of the source rank."""
diff --git a/vllm/distributed/device_communicators/cpu_communicator.py b/vllm/distributed/device_communicators/cpu_communicator.py
index c09b3ba9ceba6..fdfb74d7a752c 100644
--- a/vllm/distributed/device_communicators/cpu_communicator.py
+++ b/vllm/distributed/device_communicators/cpu_communicator.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch.distributed import ProcessGroup
@@ -18,8 +18,8 @@ class CpuCommunicator(DeviceCommunicatorBase):
     def __init__(
         self,
         cpu_group: ProcessGroup,
-        device: Optional[torch.device] = None,
-        device_group: Optional[ProcessGroup] = None,
+        device: torch.device | None = None,
+        device_group: ProcessGroup | None = None,
         unique_name: str = "",
     ):
         super().__init__(cpu_group, device, device_group, unique_name)
@@ -38,7 +38,7 @@ class CpuCommunicator(DeviceCommunicatorBase):
 
     def gather(
         self, input_: torch.Tensor, dst: int = 0, dim: int = -1
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """
         NOTE: We assume that the input tensor is on the same device across
         all the ranks.
@@ -99,7 +99,7 @@ class CpuCommunicator(DeviceCommunicatorBase):
 
     def send_tensor_dict(
         self,
-        tensor_dict: dict[str, Union[torch.Tensor, Any]],
+        tensor_dict: dict[str, torch.Tensor | Any],
         dst: int,
     ) -> None:
         return self.dist_module.send_tensor_dict(tensor_dict, dst)
@@ -107,7 +107,7 @@ class CpuCommunicator(DeviceCommunicatorBase):
     def recv_tensor_dict(
         self,
         src: int,
-    ) -> dict[str, Union[torch.Tensor, Any]]:
+    ) -> dict[str, torch.Tensor | Any]:
         return self.dist_module.recv_tensor_dict(src)
 
 
@@ -140,16 +140,16 @@ class _CPUSHMDistributed:
         return handle
 
     def all_reduce(
-        self, input: torch.Tensor, group: Optional[ProcessGroup] = None
+        self, input: torch.Tensor, group: ProcessGroup | None = None
     ) -> None:
         torch.ops._C.shm_allreduce(self.handle, input)
 
     def gather(
         self,
         input: torch.Tensor,
-        gather_list: Optional[list[torch.Tensor]],
+        gather_list: list[torch.Tensor] | None,
         dst: int = -1,
-        group: Optional[ProcessGroup] = None,
+        group: ProcessGroup | None = None,
     ) -> None:
         # Note: different from the torch gather, here we use local dst rank.
         torch.ops._C.shm_gather(
@@ -163,13 +163,13 @@ class _CPUSHMDistributed:
         self,
         output: torch.Tensor,
         input: torch.Tensor,
-        group: Optional[ProcessGroup] = None,
+        group: ProcessGroup | None = None,
     ) -> None:
         torch.ops._C.shm_all_gather(self.handle, input, output)
 
     def send_tensor_dict(
         self,
-        tensor_dict: dict[str, Union[torch.Tensor, Any]],
+        tensor_dict: dict[str, torch.Tensor | Any],
         dst: int,
     ) -> None:
         key_list = list(tensor_dict.keys())
@@ -191,7 +191,7 @@ class _CPUSHMDistributed:
     def recv_tensor_dict(
         self,
         src: int,
-    ) -> dict[str, Union[torch.Tensor, Any]]:
+    ) -> dict[str, torch.Tensor | Any]:
         tensor_list = torch.ops._C.shm_recv_tensor_list(self.handle, src)
 
         value_list: list[torch.Tensor] = tensor_list[:-1]
diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py
index 45096dffb5b63..c5c13debddb50 100644
--- a/vllm/distributed/device_communicators/cuda_communicator.py
+++ b/vllm/distributed/device_communicators/cuda_communicator.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
 
 import torch
 from torch.distributed import ProcessGroup
@@ -26,8 +25,8 @@ class CudaCommunicator(DeviceCommunicatorBase):
     def __init__(
         self,
         cpu_group: ProcessGroup,
-        device: Optional[torch.device] = None,
-        device_group: Optional[ProcessGroup] = None,
+        device: torch.device | None = None,
+        device_group: ProcessGroup | None = None,
         unique_name: str = "",
     ):
         super().__init__(cpu_group, device, device_group, unique_name)
@@ -54,7 +53,7 @@ class CudaCommunicator(DeviceCommunicatorBase):
         )
         from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
 
-        self.pynccl_comm: Optional[PyNcclCommunicator] = None
+        self.pynccl_comm: PyNcclCommunicator | None = None
         if self.world_size > 1:
             self.pynccl_comm = PyNcclCommunicator(
                 group=self.cpu_group,
@@ -63,9 +62,9 @@ class CudaCommunicator(DeviceCommunicatorBase):
             if is_symmetric_memory_enabled():
                 register_nccl_symmetric_ops(self.pynccl_comm)
 
-        self.ca_comm: Optional[CustomAllreduce] = None
-        self.qr_comm: Optional[QuickAllReduce] = None
-        self.symm_mem_comm: Optional[SymmMemCommunicator] = None
+        self.ca_comm: CustomAllreduce | None = None
+        self.qr_comm: QuickAllReduce | None = None
+        self.symm_mem_comm: SymmMemCommunicator | None = None
         if use_torch_symm_mem and current_platform.is_cuda():
             self.symm_mem_comm = SymmMemCommunicator(
                 group=self.cpu_group,
@@ -201,7 +200,7 @@ class CudaCommunicator(DeviceCommunicatorBase):
         return output.movedim(0, dim).contiguous()
 
     def reduce_scatterv(
-        self, input_: torch.Tensor, dim: int = -1, sizes: Optional[list[int]] = None
+        self, input_: torch.Tensor, dim: int = -1, sizes: list[int] | None = None
     ):
         world_size = self.world_size
         pynccl_comm = self.pynccl_comm
@@ -235,7 +234,7 @@ class CudaCommunicator(DeviceCommunicatorBase):
         # Reshape before returning
         return output.movedim(0, dim).contiguous()
 
-    def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None:
+    def send(self, tensor: torch.Tensor, dst: int | None = None) -> None:
         """Sends a tensor to the destination rank in a blocking way"""
         """NOTE: `dst` is the local rank of the destination rank."""
         if dst is None:
@@ -248,7 +247,7 @@ class CudaCommunicator(DeviceCommunicatorBase):
             torch.distributed.send(tensor, self.ranks[dst], self.device_group)
 
     def recv(
-        self, size: torch.Size, dtype: torch.dtype, src: Optional[int] = None
+        self, size: torch.Size, dtype: torch.dtype, src: int | None = None
     ) -> torch.Tensor:
         """Receives a tensor from the source rank."""
         """NOTE: `src` is the local rank of the source rank."""
@@ -274,9 +273,9 @@ class CudaCommunicator(DeviceCommunicatorBase):
 
     def all_gatherv(
         self,
-        input_: Union[torch.Tensor, list[torch.Tensor]],
+        input_: torch.Tensor | list[torch.Tensor],
         dim: int = 0,
-        sizes: Optional[list[int]] = None,
+        sizes: list[int] | None = None,
     ):
         if dim != 0:
             raise NotImplementedError("only dim 0 all-gatherv is supported")
@@ -289,7 +288,7 @@ class CudaCommunicator(DeviceCommunicatorBase):
         if sizes is not None and all(s == sizes[0] for s in sizes):
             sizes = None
 
-        def _all_gather_single(input_: torch.Tensor, sizes: Optional[list[int]] = None):
+        def _all_gather_single(input_: torch.Tensor, sizes: list[int] | None = None):
             input_size = input_.size()
             if sizes is not None:
                 assert len(sizes) == world_size
diff --git a/vllm/distributed/device_communicators/cuda_wrapper.py b/vllm/distributed/device_communicators/cuda_wrapper.py
index a77d2666e2ce3..07ab2f7124091 100644
--- a/vllm/distributed/device_communicators/cuda_wrapper.py
+++ b/vllm/distributed/device_communicators/cuda_wrapper.py
@@ -7,7 +7,7 @@ convenient for use when we just need to call a few functions.
 
 import ctypes
 from dataclasses import dataclass
-from typing import Any, Optional
+from typing import Any
 
 # this line makes it possible to directly load `libcudart.so` using `ctypes`
 import torch  # noqa
@@ -36,7 +36,7 @@ class Function:
     argtypes: list[Any]
 
 
-def find_loaded_library(lib_name) -> Optional[str]:
+def find_loaded_library(lib_name) -> str | None:
     """
     According to according to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
     the file `/proc/self/maps` contains the memory maps of the process, which includes the
@@ -113,7 +113,7 @@ class CudaRTLibrary:
     #  to the corresponding dictionary
     path_to_dict_mapping: dict[str, dict[str, Any]] = {}
 
-    def __init__(self, so_file: Optional[str] = None):
+    def __init__(self, so_file: str | None = None):
         if so_file is None:
             so_file = find_loaded_library("libcudart")
             if so_file is None:
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index fd5c5dfd9da0e..171e93ba53ee5 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import contextmanager
-from typing import Optional, Union
 
 import torch
 import torch.distributed as dist
@@ -55,7 +54,7 @@ class CustomAllreduce:
     def __init__(
         self,
         group: ProcessGroup,
-        device: Union[int, str, torch.device],
+        device: int | str | torch.device,
         max_size=8192 * 1024,
         symm_mem_enabled=False,
     ) -> None:
@@ -260,7 +259,7 @@ class CustomAllreduce:
             )
         return out
 
-    def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]:
+    def custom_all_reduce(self, input: torch.Tensor) -> torch.Tensor | None:
         """The main allreduce API that provides support for cuda graph."""
         # When custom allreduce is disabled, this will be None.
         if self.disabled or not self.should_custom_ar(input):
@@ -292,8 +291,8 @@ class CustomAllreduce:
     @staticmethod
     def create_shared_buffer(
         size_in_bytes: int,
-        group: Optional[ProcessGroup] = None,
-        uncached: Optional[bool] = False,
+        group: ProcessGroup | None = None,
+        uncached: bool | None = False,
     ) -> list[int]:
         pointer, handle = ops.allocate_shared_buffer_and_handle(size_in_bytes)
 
@@ -313,8 +312,8 @@ class CustomAllreduce:
     @staticmethod
     def free_shared_buffer(
         pointers: list[int],
-        group: Optional[ProcessGroup] = None,
-        rank: Optional[int] = None,
+        group: ProcessGroup | None = None,
+        rank: int | None = None,
     ) -> None:
         if rank is None:
             rank = dist.get_rank(group=group)
diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index 59fa3f9c449b0..9b293d584a0a2 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
 
 # ===================== import region =====================
 import torch
@@ -59,9 +58,9 @@ def register_nccl_symmetric_ops(pynccl_comm):
 class PyNcclCommunicator:
     def __init__(
         self,
-        group: Union[ProcessGroup, StatelessProcessGroup],
-        device: Union[int, str, torch.device],
-        library_path: Optional[str] = None,
+        group: ProcessGroup | StatelessProcessGroup,
+        device: int | str | torch.device,
+        library_path: str | None = None,
     ):
         """
         Args:
diff --git a/vllm/distributed/device_communicators/pynccl_allocator.py b/vllm/distributed/device_communicators/pynccl_allocator.py
index 3fe4fd744d77a..a2ed3628f4617 100644
--- a/vllm/distributed/device_communicators/pynccl_allocator.py
+++ b/vllm/distributed/device_communicators/pynccl_allocator.py
@@ -3,7 +3,7 @@
 import atexit
 import contextlib
 import tempfile
-from typing import Any, Optional
+from typing import Any
 
 import torch
 from packaging import version
@@ -141,7 +141,7 @@ class nccl_symm_mem_context:
             or version.parse(torch.__version__) < version.parse("2.8.0.a0")
         )
         if self.disabled:
-            self.pynccl_comm: Optional[PyNcclCommunicator] = None
+            self.pynccl_comm: PyNcclCommunicator | None = None
             self._mem_pool_ctx: contextlib.AbstractContextManager[Any] = (
                 contextlib.nullcontext()
             )
diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py
index e4d7b0f8fb85a..28d4afde16035 100644
--- a/vllm/distributed/device_communicators/pynccl_wrapper.py
+++ b/vllm/distributed/device_communicators/pynccl_wrapper.py
@@ -25,7 +25,7 @@
 import ctypes
 import platform
 from dataclasses import dataclass
-from typing import Any, Optional
+from typing import Any
 
 import torch
 from torch.distributed import ReduceOp
@@ -305,7 +305,7 @@ class NCCLLibrary:
     #  to the corresponding dictionary
     path_to_dict_mapping: dict[str, dict[str, Any]] = {}
 
-    def __init__(self, so_file: Optional[str] = None):
+    def __init__(self, so_file: str | None = None):
         so_file = so_file or find_nccl_library()
 
         try:
diff --git a/vllm/distributed/device_communicators/quick_all_reduce.py b/vllm/distributed/device_communicators/quick_all_reduce.py
index 16b6b6c28ea3a..7a95749635268 100644
--- a/vllm/distributed/device_communicators/quick_all_reduce.py
+++ b/vllm/distributed/device_communicators/quick_all_reduce.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from enum import Enum
-from typing import Union
 
 import torch
 import torch.distributed as dist
@@ -58,9 +57,7 @@ class QuickAllReduce:
         (torch.bfloat16, 8): [16 * MB, 2048 * MB, 2048 * MB, 2048 * MB],
     }
 
-    def __init__(
-        self, group: ProcessGroup, device: Union[int, str, torch.device]
-    ) -> None:
+    def __init__(self, group: ProcessGroup, device: int | str | torch.device) -> None:
         """
         Custom allreduce provides non-destructive acceleration and is
         available for CUDA and ROCm MI300 series.
diff --git a/vllm/distributed/device_communicators/ray_communicator.py b/vllm/distributed/device_communicators/ray_communicator.py
index da79afc7ac145..732a40770f254 100644
--- a/vllm/distributed/device_communicators/ray_communicator.py
+++ b/vllm/distributed/device_communicators/ray_communicator.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import uuid
-from typing import Any, Optional
+from typing import Any
 
 import ray
 import torch
@@ -27,15 +27,15 @@ class RayPPCommunicator(Communicator):
     This class is not thread-safe.
     """
 
-    _comm: Optional[DeviceCommunicatorBase]
+    _comm: DeviceCommunicatorBase | None
 
     def __init__(
         self,
         world_size: int,
         comm_id: Any,
-        rank: Optional[int],
+        rank: int | None,
         actor_handles: list["ray.actor.ActorHandle"],
-        cuda_stream: Optional[torch.cuda.Stream],
+        cuda_stream: torch.cuda.Stream | None,
         use_communication_streams: bool = False,
     ):
         """
@@ -56,7 +56,7 @@ class RayPPCommunicator(Communicator):
                 This is not supported.
         """
         self._world_size = world_size
-        self._rank: Optional[int] = None
+        self._rank: int | None = None
         self._actor_handles = actor_handles
         if use_communication_streams:
             raise NotImplementedError("use_communication_streams is not supported")
@@ -143,7 +143,7 @@ class RayPPCommunicator(Communicator):
         else:
             raise ValueError(f"Actor {actor} not found in communicator group")
 
-    def get_self_rank(self) -> Optional[int]:
+    def get_self_rank(self) -> int | None:
         """
         Return this actor's rank.
         """
diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py
index 4cec601027284..cd201503bf17d 100644
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -7,7 +7,7 @@ from contextlib import contextmanager
 from dataclasses import dataclass, field
 from multiprocessing import shared_memory
 from threading import Event
-from typing import Any, Optional, Union
+from typing import Any
 from unittest.mock import patch
 
 import torch
@@ -80,7 +80,7 @@ class ShmRingBuffer:
         n_reader: int,
         max_chunk_bytes: int,
         max_chunks: int,
-        name: Optional[str] = None,
+        name: str | None = None,
     ):
         """
         A shared memory ring buffer implementation for broadcast communication.
@@ -213,9 +213,9 @@ class ShmRingBuffer:
 class Handle:
     local_reader_ranks: list[int] = field(default_factory=list)
 
-    buffer_handle: Optional[tuple[int, int, int, str]] = None
-    local_subscribe_addr: Optional[str] = None
-    remote_subscribe_addr: Optional[str] = None
+    buffer_handle: tuple[int, int, int, str] | None = None
+    local_subscribe_addr: str | None = None
+    remote_subscribe_addr: str | None = None
     remote_addr_ipv6: bool = False
 
 
@@ -224,10 +224,10 @@ class MessageQueue:
         self,
         n_reader,  # number of all readers
         n_local_reader,  # number of local readers through shared memory
-        local_reader_ranks: Optional[list[int]] = None,
+        local_reader_ranks: list[int] | None = None,
         max_chunk_bytes: int = 1024 * 1024 * 10,
         max_chunks: int = 10,
-        connect_ip: Optional[str] = None,
+        connect_ip: str | None = None,
     ):
         if local_reader_ranks is None:
             local_reader_ranks = list(range(n_local_reader))
@@ -384,7 +384,7 @@ class MessageQueue:
             assert recv == b"READY"
 
     @contextmanager
-    def acquire_write(self, timeout: Optional[float] = None):
+    def acquire_write(self, timeout: float | None = None):
         assert self._is_writer, "Only writers can acquire write"
         start_time = time.monotonic()
         n_warning = 1
@@ -444,8 +444,8 @@ class MessageQueue:
     @contextmanager
     def acquire_read(
         self,
-        timeout: Optional[float] = None,
-        cancel: Optional[Event] = None,
+        timeout: float | None = None,
+        cancel: Event | None = None,
         indefinite: bool = False,
     ):
         assert self._is_local_reader, "Only readers can acquire read"
@@ -502,7 +502,7 @@ class MessageQueue:
                 self._read_spin_timer.record_activity()
                 break
 
-    def enqueue(self, obj, timeout: Optional[float] = None):
+    def enqueue(self, obj, timeout: float | None = None):
         """Write to message queue with optional timeout (in seconds)"""
         assert self._is_writer, "Only writers can enqueue"
         serialized_obj = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
@@ -520,8 +520,8 @@ class MessageQueue:
 
     def dequeue(
         self,
-        timeout: Optional[float] = None,
-        cancel: Optional[Event] = None,
+        timeout: float | None = None,
+        cancel: Event | None = None,
         indefinite: bool = False,
     ):
         """Read from message queue with optional timeout (in seconds)"""
@@ -542,7 +542,7 @@ class MessageQueue:
         return obj
 
     @staticmethod
-    def recv(socket: zmq.Socket, timeout: Optional[float]) -> Any:
+    def recv(socket: zmq.Socket, timeout: float | None) -> Any:
         timeout_ms = None if timeout is None else int(timeout * 1000)
         if not socket.poll(timeout=timeout_ms):
             raise TimeoutError
@@ -558,7 +558,7 @@ class MessageQueue:
 
     @staticmethod
     def create_from_process_group(
-        pg: Union[ProcessGroup, StatelessProcessGroup],
+        pg: ProcessGroup | StatelessProcessGroup,
         max_chunk_bytes,
         max_chunks,
         writer_rank=0,
diff --git a/vllm/distributed/device_communicators/shm_object_storage.py b/vllm/distributed/device_communicators/shm_object_storage.py
index a5486c30edf29..080bc03e39137 100644
--- a/vllm/distributed/device_communicators/shm_object_storage.py
+++ b/vllm/distributed/device_communicators/shm_object_storage.py
@@ -3,13 +3,13 @@
 
 import pickle
 from abc import ABC, abstractmethod
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from contextlib import contextmanager
 from dataclasses import dataclass
 from itertools import chain
 from multiprocessing import shared_memory
 from multiprocessing.synchronize import Lock as LockType
-from typing import Any, Callable, Optional, Union
+from typing import Any
 from unittest.mock import patch
 
 import torch
@@ -109,7 +109,7 @@ class SingleWriterShmRingBuffer:
     def __init__(
         self,
         data_buffer_size: int,
-        name: Optional[str] = None,
+        name: str | None = None,
         create: bool = False,
     ):
         self.data_buffer_size = data_buffer_size
@@ -252,7 +252,7 @@ class SingleWriterShmRingBuffer:
     def free_buf(
         self,
         is_free_fn: Callable[[int, memoryview], bool],
-        nbytes: Optional[int] = None,
+        nbytes: int | None = None,
     ) -> Iterable[int]:
         """
         Free a buffer of the given size. This is a no-op in shared memory,
@@ -340,9 +340,7 @@ class MsgpackSerde(ObjectSerde):
         self.mm_decoder = MsgpackDecoder(MultiModalKwargsItem)
         self._mm_kwargs_item_cls = MultiModalKwargsItem
 
-    def serialize(
-        self, value: Any
-    ) -> tuple[Union[bytes, list[bytes]], int, bytes, int]:
+    def serialize(self, value: Any) -> tuple[bytes | list[bytes], int, bytes, int]:
         len_arr = None
         if isinstance(value, (torch.Tensor, self._mm_kwargs_item_cls)):
             type_name = type(value).__name__
@@ -396,7 +394,7 @@ class ShmObjectStorageHandle:
     n_readers: int
     ring_buffer_handle: tuple[int, str]
     serde_class: type[ObjectSerde]
-    reader_lock: Optional[LockType]
+    reader_lock: LockType | None
 
 
 class SingleWriterShmObjectStorage:
@@ -444,7 +442,7 @@ class SingleWriterShmObjectStorage:
         n_readers: int,
         ring_buffer: SingleWriterShmRingBuffer,
         serde_class: type[ObjectSerde] = MsgpackSerde,
-        reader_lock: Optional[LockType] = None,
+        reader_lock: LockType | None = None,
     ):
         """
         Initialize the object storage.
@@ -492,7 +490,7 @@ class SingleWriterShmObjectStorage:
 
     def copy_to_buffer(
         self,
-        data: Union[bytes, list[bytes]],
+        data: bytes | list[bytes],
         data_bytes: int,
         metadata: bytes,
         md_bytes: int,
diff --git a/vllm/distributed/device_communicators/symm_mem.py b/vllm/distributed/device_communicators/symm_mem.py
index 88451f9552c13..aeea9b777b255 100644
--- a/vllm/distributed/device_communicators/symm_mem.py
+++ b/vllm/distributed/device_communicators/symm_mem.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional, Union
 
 import torch
 import torch.distributed as dist
@@ -31,10 +30,10 @@ class SymmMemCommunicator:
     def __init__(
         self,
         group: ProcessGroup,
-        device: Union[int, str, torch.device],
+        device: int | str | torch.device,
         # add options for testing
-        force_multimem: Optional[bool] = None,
-        max_size_override: Optional[int] = None,
+        force_multimem: bool | None = None,
+        max_size_override: int | None = None,
     ):
         self.disabled = True
 
@@ -108,8 +107,8 @@ class SymmMemCommunicator:
         return inp_size < self.max_size
 
     def all_reduce(
-        self, inp: torch.Tensor, *, out: Optional[torch.Tensor] = None
-    ) -> Optional[torch.Tensor]:
+        self, inp: torch.Tensor, *, out: torch.Tensor | None = None
+    ) -> torch.Tensor | None:
         if not self.should_use_symm_mem(inp):
             return None
         if out is None:
diff --git a/vllm/distributed/device_communicators/tpu_communicator.py b/vllm/distributed/device_communicators/tpu_communicator.py
index b2faea512791a..f20cdfab340f3 100644
--- a/vllm/distributed/device_communicators/tpu_communicator.py
+++ b/vllm/distributed/device_communicators/tpu_communicator.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
-from typing import Optional
 
 import torch
 from torch.distributed import ProcessGroup
@@ -39,8 +38,8 @@ class TpuCommunicator(DeviceCommunicatorBase):
     def __init__(
         self,
         cpu_group: ProcessGroup,
-        device: Optional[torch.device] = None,
-        device_group: Optional[ProcessGroup] = None,
+        device: torch.device | None = None,
+        device_group: ProcessGroup | None = None,
         unique_name: str = "",
     ):
         super().__init__(cpu_group, device, device_group, unique_name)
diff --git a/vllm/distributed/device_communicators/xpu_communicator.py b/vllm/distributed/device_communicators/xpu_communicator.py
index 33d5b2cf1d879..83e336511059b 100644
--- a/vllm/distributed/device_communicators/xpu_communicator.py
+++ b/vllm/distributed/device_communicators/xpu_communicator.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 import torch.distributed as dist
@@ -19,8 +18,8 @@ class XpuCommunicator(DeviceCommunicatorBase):
     def __init__(
         self,
         cpu_group: ProcessGroup,
-        device: Optional[torch.device] = None,
-        device_group: Optional[ProcessGroup] = None,
+        device: torch.device | None = None,
+        device_group: ProcessGroup | None = None,
         unique_name: str = "",
     ):
         super().__init__(cpu_group, device, device_group, unique_name)
@@ -45,7 +44,7 @@ class XpuCommunicator(DeviceCommunicatorBase):
 
     def gather(
         self, input_: torch.Tensor, dst: int = 0, dim: int = -1
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         assert -input_.dim() <= dim < input_.dim(), (
             f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
         )
diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py
index 663f040270461..17716e8a07ac0 100644
--- a/vllm/distributed/eplb/eplb_state.py
+++ b/vllm/distributed/eplb/eplb_state.py
@@ -29,7 +29,6 @@ physical experts.
 import time
 from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import Optional, Union
 
 import torch
 from torch.distributed import ProcessGroup, all_reduce
@@ -186,9 +185,9 @@ class EplbState:
         model: MixtureOfExperts,
         device: torch.device,
         parallel_config: ParallelConfig,
-        global_expert_load: Optional[torch.Tensor] = None,
-        old_global_expert_indices: Optional[torch.Tensor] = None,
-        rank_mapping: Optional[dict[int, int]] = None,
+        global_expert_load: torch.Tensor | None = None,
+        old_global_expert_indices: torch.Tensor | None = None,
+        rank_mapping: dict[int, int] | None = None,
     ) -> "EplbState":
         """
         Build the initial EPLB state.
@@ -439,9 +438,9 @@ class EplbState:
         model: MixtureOfExperts,
         is_profile: bool = False,
         execute_shuffle: bool = True,
-        global_expert_load: Optional[torch.Tensor] = None,
-        rank_mapping: Optional[dict[int, int]] = None,
-    ) -> Optional[torch.Tensor]:
+        global_expert_load: torch.Tensor | None = None,
+        rank_mapping: dict[int, int] | None = None,
+    ) -> torch.Tensor | None:
         """
         Rearrange the experts according to the current load.
         """
@@ -611,7 +610,7 @@ class EplbState:
 
 
 def _node_count_with_rank_mapping(
-    pg: Union[ProcessGroup, StatelessProcessGroup],
+    pg: ProcessGroup | StatelessProcessGroup,
     rank_mapping: dict[int, int],
 ) -> int:
     if isinstance(pg, ProcessGroup):
diff --git a/vllm/distributed/eplb/rebalance_execute.py b/vllm/distributed/eplb/rebalance_execute.py
index 344fae457c9b5..f8ec3e956401a 100644
--- a/vllm/distributed/eplb/rebalance_execute.py
+++ b/vllm/distributed/eplb/rebalance_execute.py
@@ -8,7 +8,6 @@ This involves the exchange of expert weights between GPUs.
 
 from collections.abc import Iterable, MutableSequence, Sequence
 from functools import partial
-from typing import Optional
 
 import torch
 from torch.distributed import (
@@ -253,7 +252,7 @@ def rearrange_expert_weights_inplace(
     expert_weights: Sequence[Iterable[torch.Tensor]],
     ep_group: ProcessGroup,
     is_profile: bool = False,
-    rank_mapping: Optional[dict[int, int]] = None,
+    rank_mapping: dict[int, int] | None = None,
 ) -> None:
     """
     Rearranges the expert weights in place according to the new expert indices.
diff --git a/vllm/distributed/kv_events.py b/vllm/distributed/kv_events.py
index d93ae63e0eb4d..6be2557ede40d 100644
--- a/vllm/distributed/kv_events.py
+++ b/vllm/distributed/kv_events.py
@@ -6,10 +6,11 @@ import threading
 import time
 from abc import ABC, abstractmethod
 from collections import deque
+from collections.abc import Callable
 from dataclasses import asdict
 from itertools import count
 from queue import Queue
-from typing import Any, Callable, Optional, Union
+from typing import Any
 
 import msgspec
 import zmq
@@ -29,7 +30,7 @@ class EventBatch(
 ):
     ts: float
     events: list[Any]
-    data_parallel_rank: Optional[int] = None
+    data_parallel_rank: int | None = None
 
 
 class KVCacheEvent(
@@ -47,16 +48,16 @@ MEDIUM_GPU = "GPU"
 
 class BlockStored(KVCacheEvent):
     block_hashes: list[ExternalBlockHash]
-    parent_block_hash: Optional[ExternalBlockHash]
+    parent_block_hash: ExternalBlockHash | None
     token_ids: list[int]
     block_size: int
-    lora_id: Optional[int]
-    medium: Optional[str]
+    lora_id: int | None
+    medium: str | None
 
 
 class BlockRemoved(KVCacheEvent):
     block_hashes: list[ExternalBlockHash]
-    medium: Optional[str]
+    medium: str | None
 
 
 class AllBlocksCleared(KVCacheEvent):
@@ -64,7 +65,7 @@ class AllBlocksCleared(KVCacheEvent):
 
 
 class KVEventBatch(EventBatch):
-    events: list[Union[BlockStored, BlockRemoved, AllBlocksCleared]]
+    events: list[BlockStored | BlockRemoved | AllBlocksCleared]
 
 
 class EventPublisher(ABC):
@@ -139,7 +140,7 @@ class ZmqEventPublisher(EventPublisher):
         self,
         data_parallel_rank: int,
         endpoint: str = "tcp://*:5557",
-        replay_endpoint: Optional[str] = None,
+        replay_endpoint: str | None = None,
         buffer_steps: int = 10_000,
         hwm: int = 100_000,
         max_queue_size: int = 100_000,
@@ -147,13 +148,13 @@ class ZmqEventPublisher(EventPublisher):
     ) -> None:
         # Storage
         super().__init__(data_parallel_rank)
-        self._event_queue = Queue[Optional[EventBatch]](maxsize=max_queue_size)
+        self._event_queue = Queue[EventBatch | None](maxsize=max_queue_size)
         self._buffer = deque[tuple[int, bytes]](maxlen=buffer_steps)
 
         # ZMQ sockets
         self._ctx = zmq.Context.instance()
-        self._pub: Optional[zmq.Socket] = None
-        self._replay: Optional[zmq.Socket] = None
+        self._pub: zmq.Socket | None = None
+        self._replay: zmq.Socket | None = None
         self._dp_rank = data_parallel_rank
 
         self._endpoint = self.offset_endpoint_port(endpoint, self._dp_rank)
@@ -303,8 +304,8 @@ class ZmqEventPublisher(EventPublisher):
 
     @staticmethod
     def offset_endpoint_port(
-        endpoint: Optional[str], data_parallel_rank: int
-    ) -> Optional[str]:
+        endpoint: str | None, data_parallel_rank: int
+    ) -> str | None:
         """Helper function to offset the port in an endpoint by
             the data parallel rank.
 
@@ -349,7 +350,7 @@ class EventPublisherFactory:
 
     @classmethod
     def create(
-        cls, config: Optional[KVEventsConfig], data_parallel_rank: int = 0
+        cls, config: KVEventsConfig | None, data_parallel_rank: int = 0
     ) -> EventPublisher:
         """Create publisher from a config mapping."""
         if not config:
diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py
index 395a4e20e0ba3..aaf43842cf7cd 100644
--- a/vllm/distributed/kv_transfer/kv_connector/factory.py
+++ b/vllm/distributed/kv_transfer/kv_connector/factory.py
@@ -2,7 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import importlib
-from typing import TYPE_CHECKING, Callable
+from collections.abc import Callable
+from typing import TYPE_CHECKING
 
 import vllm.envs as envs
 from vllm.distributed.kv_transfer.kv_connector.base import (
diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py
index 056ece60e84dd..b7e9daaa5b598 100644
--- a/vllm/distributed/kv_transfer/kv_connector/utils.py
+++ b/vllm/distributed/kv_transfer/kv_connector/utils.py
@@ -7,7 +7,7 @@ KV cache helper for store.
 from collections import defaultdict
 from collections.abc import Sequence
 from concurrent.futures import CancelledError, Future
-from typing import Literal, Optional, Union, cast
+from typing import Literal, cast
 
 import torch
 
@@ -136,7 +136,7 @@ class KVOutputAggregator:
         # Aggregate kv_connector_output from all workers
 
         def update_finished_set(
-            req_ids: Optional[set[str]],
+            req_ids: set[str] | None,
             remaining_count_dict: dict[str, int],
             finished_set: set[str],
         ) -> None:
@@ -197,7 +197,7 @@ class KVOutputAggregator:
         to the respective list of outputs."""
         result_future: Future[ModelRunnerOutput] = Future()
 
-        outputs: list[Optional[ModelRunnerOutput]] = [None] * len(output_futures)
+        outputs: list[ModelRunnerOutput | None] = [None] * len(output_futures)
 
         def make_callback(idx):
             def callback(fut):
@@ -230,8 +230,8 @@ class KVOutputAggregator:
 def _make_src_and_dst_indices(
     src_block_ids: list[int],
     dst_block_ids: list[int],
-    src_device: Union[torch.device, str],
-    dst_device: Union[torch.device, str],
+    src_device: torch.device | str,
+    dst_device: torch.device | str,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     src_indices = torch.tensor(src_block_ids, device=src_device, dtype=torch.int64)
     dst_indices = torch.tensor(dst_block_ids, device=dst_device, dtype=torch.int64)
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
index e871b3017d8bb..c51e26ce2f447 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -37,8 +37,8 @@ The class provides the following primitives:
 
 import enum
 from abc import ABC, abstractmethod
-from collections.abc import Iterable
-from typing import TYPE_CHECKING, Any, Callable, Literal, Optional
+from collections.abc import Callable, Iterable
+from typing import TYPE_CHECKING, Any, Literal, Optional
 
 import torch
 
@@ -93,7 +93,7 @@ class KVConnectorBase_V1(ABC):
             "Initializing KVConnectorBase_V1. This API is experimental and "
             "subject to change in the future as we iterate the design."
         )
-        self._connector_metadata: Optional[KVConnectorMetadata] = None
+        self._connector_metadata: KVConnectorMetadata | None = None
         self._vllm_config = vllm_config
         self._role = role
 
@@ -222,7 +222,7 @@ class KVConnectorBase_V1(ABC):
 
     def get_finished(
         self, finished_req_ids: set[str]
-    ) -> tuple[Optional[set[str]], Optional[set[str]]]:
+    ) -> tuple[set[str] | None, set[str] | None]:
         """
         Notifies worker-side connector ids of requests that have
         finished generating tokens on the worker.
@@ -281,7 +281,7 @@ class KVConnectorBase_V1(ABC):
         self,
         request: "Request",
         num_computed_tokens: int,
-    ) -> tuple[Optional[int], bool]:
+    ) -> tuple[int | None, bool]:
         """
         Get number of new tokens that can be loaded from the
         external KV cache beyond the num_computed_tokens.
@@ -361,7 +361,7 @@ class KVConnectorBase_V1(ABC):
         self,
         request: "Request",
         block_ids: list[int],
-    ) -> tuple[bool, Optional[dict[str, Any]]]:
+    ) -> tuple[bool, dict[str, Any] | None]:
         """
         Called exactly once when a request has finished, before its blocks are
         freed.
@@ -388,7 +388,7 @@ class KVConnectorBase_V1(ABC):
         return ()
 
     @classmethod
-    def get_required_kvcache_layout(cls, vllm_config: "VllmConfig") -> Optional[str]:
+    def get_required_kvcache_layout(cls, vllm_config: "VllmConfig") -> str | None:
         """
         Get the required KV cache layout for this connector.
         Args:
@@ -406,7 +406,7 @@ class KVConnectorBase_V1(ABC):
             )
         return None
 
-    def get_finished_count(self) -> Optional[int]:
+    def get_finished_count(self) -> int | None:
         """
         Get the count of requests expected to complete send/receive operations
         via this connector.
@@ -419,7 +419,7 @@ class KVConnectorBase_V1(ABC):
 
     @classmethod
     def build_kv_connector_stats(
-        cls, data: Optional[dict[str, Any]] = None
+        cls, data: dict[str, Any] | None = None
     ) -> Optional["KVConnectorStats"]:
         """
         KVConnectorStats resolution method. This method allows dynamically
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
index b50cc3ab30fa9..3abb7791057a1 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import torch
 from lmcache.integration.vllm.vllm_v1_adapter import LMCacheConnectorV1Impl
@@ -96,7 +96,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
 
     def get_finished(
         self, finished_req_ids: set[str]
-    ) -> tuple[Optional[set[str]], Optional[set[str]]]:
+    ) -> tuple[set[str] | None, set[str] | None]:
         """
         Notifies worker-side connector ids of requests that have
         finished generating tokens.
@@ -117,7 +117,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
         self,
         request: "Request",
         num_computed_tokens: int,
-    ) -> tuple[Optional[int], bool]:
+    ) -> tuple[int | None, bool]:
         """
         Get number of new tokens that can be loaded from the
         external KV cache beyond the num_computed_tokens.
@@ -161,7 +161,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
         self,
         request: "Request",
         block_ids: list[int],
-    ) -> tuple[bool, Optional[dict[str, Any]]]:
+    ) -> tuple[bool, dict[str, Any] | None]:
         """
         Called when a request has finished, before its blocks are freed.
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py b/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py
index 879cc9a23581a..21002fe572c52 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass, field
-from typing import Any, Optional, Union
+from typing import Any
 
 from vllm.config.kv_transfer import KVTransferConfig
 from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
@@ -32,7 +32,7 @@ class KVConnectorStats:
         """
         raise NotImplementedError
 
-    def reduce(self) -> dict[str, Union[int, float]]:
+    def reduce(self) -> dict[str, int | float]:
         """
         Reduce the observations collected during a time interval to one or
         more representative values (eg avg/median/sum of the series).
@@ -58,7 +58,7 @@ class KVConnectorLogging:
         self.reset()
 
     def reset(self):
-        self.transfer_stats_accumulator: Optional[KVConnectorStats] = None
+        self.transfer_stats_accumulator: KVConnectorStats | None = None
 
     def observe(self, transfer_stats_data: dict[str, Any]):
         # Should not be called when a KVConnector is not configured.
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
index e48d4ccd1d6c0..25625762f447b 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
@@ -3,7 +3,7 @@
 import copy
 from collections.abc import Iterable
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import torch
 
@@ -33,7 +33,7 @@ logger = init_logger(__name__)
 @dataclass
 class MultiKVConnectorMetadata(KVConnectorMetadata):
     metadata: tuple[KVConnectorMetadata, ...]
-    extra_async_saves: Optional[dict[str, int]] = None
+    extra_async_saves: dict[str, int] | None = None
 
 
 @dataclass
@@ -130,7 +130,7 @@ class MultiConnector(KVConnectorBase_V1):
             c.clear_connector_metadata()
 
     def shutdown(self):
-        exception: Optional[Exception] = None
+        exception: Exception | None = None
         for c in self._connectors:
             try:
                 c.shutdown()
@@ -169,7 +169,7 @@ class MultiConnector(KVConnectorBase_V1):
 
     def get_finished(
         self, finished_req_ids: set[str]
-    ) -> tuple[Optional[set[str]], Optional[set[str]]]:
+    ) -> tuple[set[str] | None, set[str] | None]:
         finished_sending: set[str] = set()
         finished_recving: set[str] = set()
         for c in self._connectors:
@@ -207,7 +207,7 @@ class MultiConnector(KVConnectorBase_V1):
         self,
         request: "Request",
         num_computed_tokens: int,
-    ) -> tuple[Optional[int], bool]:
+    ) -> tuple[int | None, bool]:
         to_return = (0, False)
         for i, c in enumerate(self._connectors):
             toks, load_async = c.get_num_new_matched_tokens(
@@ -258,7 +258,7 @@ class MultiConnector(KVConnectorBase_V1):
         self,
         request: "Request",
         blocks: list[int],
-    ) -> tuple[bool, Optional[dict[str, Any]]]:
+    ) -> tuple[bool, dict[str, Any] | None]:
         async_saves = 0
         kv_txfer_params = None
         for c in self._connectors:
@@ -286,7 +286,7 @@ class MultiConnector(KVConnectorBase_V1):
             yield from c.take_events()
 
     @classmethod
-    def get_required_kvcache_layout(cls, vllm_config: "VllmConfig") -> Optional[str]:
+    def get_required_kvcache_layout(cls, vllm_config: "VllmConfig") -> str | None:
         """
         Get the required KV cache layout for this connector.
         Args:
@@ -323,17 +323,17 @@ class MultiConnector(KVConnectorBase_V1):
 
     @classmethod
     def build_kv_connector_stats(
-        cls, data: Optional[dict[str, Any]] = None
-    ) -> Optional[KVConnectorStats]:
+        cls, data: dict[str, Any] | None = None
+    ) -> KVConnectorStats | None:
         return (
             MultiKVConnectorStats(data=data)
             if data is not None
             else MultiKVConnectorStats()
         )
 
-    def get_kv_connector_stats(self) -> Optional[MultiKVConnectorStats]:
+    def get_kv_connector_stats(self) -> MultiKVConnectorStats | None:
         # Group connector stats by connector type.
-        stats_by_connector: Optional[MultiKVConnectorStats] = None
+        stats_by_connector: MultiKVConnectorStats | None = None
         for c in self._connectors:
             stats = c.get_kv_connector_stats()
             if stats is None:
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 365d1a1ff280c..a8730bf789874 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -13,7 +13,7 @@ from collections import defaultdict
 from collections.abc import Iterator
 from concurrent.futures import Future, ThreadPoolExecutor
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any
 
 import msgspec
 import numpy as np
@@ -153,10 +153,10 @@ class NixlConnector(KVConnectorBase_V1):
         self.engine_id: EngineId = vllm_config.kv_transfer_config.engine_id
 
         if role == KVConnectorRole.SCHEDULER:
-            self.connector_scheduler: Optional[NixlConnectorScheduler] = (
+            self.connector_scheduler: NixlConnectorScheduler | None = (
                 NixlConnectorScheduler(vllm_config, self.engine_id)
             )
-            self.connector_worker: Optional[NixlConnectorWorker] = None
+            self.connector_worker: NixlConnectorWorker | None = None
         elif role == KVConnectorRole.WORKER:
             self.connector_scheduler = None
             self.connector_worker = NixlConnectorWorker(vllm_config, self.engine_id)
@@ -189,7 +189,7 @@ class NixlConnector(KVConnectorBase_V1):
 
     def get_num_new_matched_tokens(
         self, request: "Request", num_computed_tokens: int
-    ) -> tuple[Optional[int], bool]:
+    ) -> tuple[int | None, bool]:
         assert self.connector_scheduler is not None
         return self.connector_scheduler.get_num_new_matched_tokens(
             request, num_computed_tokens
@@ -214,7 +214,7 @@ class NixlConnector(KVConnectorBase_V1):
         self,
         request: "Request",
         block_ids: list[int],
-    ) -> tuple[bool, Optional[dict[str, Any]]]:
+    ) -> tuple[bool, dict[str, Any] | None]:
         assert self.connector_scheduler is not None
         return self.connector_scheduler.request_finished(request, block_ids)
 
@@ -234,14 +234,14 @@ class NixlConnector(KVConnectorBase_V1):
         assert self.connector_worker is not None
         return self.connector_worker.get_finished()
 
-    def get_kv_connector_stats(self) -> Optional[KVConnectorStats]:
+    def get_kv_connector_stats(self) -> KVConnectorStats | None:
         assert self.connector_worker is not None
         return self.connector_worker.get_kv_connector_stats()
 
     @classmethod
     def build_kv_connector_stats(
-        cls, data: Optional[dict[str, Any]] = None
-    ) -> Optional[KVConnectorStats]:
+        cls, data: dict[str, Any] | None = None
+    ) -> KVConnectorStats | None:
         return (
             NixlKVConnectorStats(data=data)
             if data is not None
@@ -445,7 +445,7 @@ class NixlConnectorScheduler:
         self,
         request: "Request",
         block_ids: list[int],
-    ) -> tuple[bool, Optional[dict[str, Any]]]:
+    ) -> tuple[bool, dict[str, Any] | None]:
         """
         Once a request is finished, determine whether request blocks
         should be freed now or will be sent asynchronously and freed later.
@@ -584,7 +584,7 @@ class NixlConnectorWorker:
             )
 
         # Note: host xfer buffer ops when use_host_buffer is True
-        self.copy_blocks: Optional[CopyBlocksOp] = None
+        self.copy_blocks: CopyBlocksOp | None = None
 
         # Map of engine_id -> kv_caches_base_addr. For TP case, each local
         # rank will still only pull from a single remote TP worker.
@@ -615,7 +615,7 @@ class NixlConnectorWorker:
         self._reqs_to_process: set[ReqId] = set()
 
         # Background thread for handling new handshake requests.
-        self._nixl_handshake_listener_t: Optional[threading.Thread] = None
+        self._nixl_handshake_listener_t: threading.Thread | None = None
         # Background thread for initializing new NIXL handshakes.
         self._handshake_initiation_executor = ThreadPoolExecutor(
             # NIXL is not guaranteed to be thread-safe, limit 1 worker.
@@ -635,7 +635,7 @@ class NixlConnectorWorker:
         # TODO(mgoin): remove this once we have hybrid memory allocator
         # Optimization for models with local attention (Llama 4)
         # List of block window sizes for each layer for local attention
-        self.block_window_per_layer: list[Optional[int]] = []
+        self.block_window_per_layer: list[int | None] = []
         self.use_mla = self.model_config.use_mla
 
         backend = get_attn_backend(
@@ -1472,7 +1472,7 @@ class NixlConnectorWorker:
         self._recving_transfers[request_id].append((handle, time.perf_counter()))
 
     def _get_block_descs_ids(
-        self, engine_id: str, block_ids: list[int], layer_idx: Optional[int] = None
+        self, engine_id: str, block_ids: list[int], layer_idx: int | None = None
     ) -> np.ndarray:
         """
         Get the descs ids for a set of block ids.
@@ -1518,7 +1518,7 @@ class NixlConnectorWorker:
             block_len = self.block_len_per_layer[layer_idx]
         return block_len
 
-    def get_kv_connector_stats(self) -> Optional[KVConnectorStats]:
+    def get_kv_connector_stats(self) -> KVConnectorStats | None:
         """
         Get the KV transfer stats for the connector.
         """
@@ -1559,7 +1559,7 @@ def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]:
     if socket_type not in (zmq.ROUTER, zmq.REQ):
         raise ValueError(f"Unexpected socket type: {socket_type}")
 
-    ctx: Optional[zmq.Context] = None
+    ctx: zmq.Context | None = None
     try:
         ctx = zmq.Context()  # type: ignore[attr-defined]
         yield make_zmq_socket(
@@ -1611,7 +1611,7 @@ class NixlKVConnectorStats(KVConnectorStats):
                 accumulator.extend(v)
         return self
 
-    def reduce(self) -> dict[str, Union[int, float]]:
+    def reduce(self) -> dict[str, int | float]:
         # Compute compact representative stats suitable for CLI logging
         if self.is_empty():
             return {
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
index 745af0efba180..6d4ffc152de97 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
@@ -4,7 +4,7 @@ from collections import defaultdict
 from collections.abc import Iterable, Iterator
 from dataclasses import dataclass
 from itertools import islice
-from typing import Any, Optional
+from typing import Any
 
 import torch
 
@@ -46,8 +46,8 @@ class OffloadingConnector(KVConnectorBase_V1):
 
         spec = OffloadingSpecFactory.create_spec(vllm_config)
 
-        self.connector_scheduler: Optional[OffloadingConnectorScheduler] = None
-        self.connector_worker: Optional[OffloadingConnectorWorker] = None
+        self.connector_scheduler: OffloadingConnectorScheduler | None = None
+        self.connector_worker: OffloadingConnectorWorker | None = None
         if role == KVConnectorRole.SCHEDULER:
             self.connector_scheduler = OffloadingConnectorScheduler(spec)
         elif role == KVConnectorRole.WORKER:
@@ -113,7 +113,7 @@ class OffloadingConnector(KVConnectorBase_V1):
         self,
         request: "Request",
         block_ids: list[int],
-    ) -> tuple[bool, Optional[dict[str, Any]]]:
+    ) -> tuple[bool, dict[str, Any] | None]:
         assert self.connector_scheduler is not None
         return self.connector_scheduler.request_finished(request, block_ids)
 
@@ -148,7 +148,7 @@ class OffloadingConnectorScheduler:
         self,
         req: Request,
         start_idx: int = 0,
-        end_idx: Optional[int] = None,
+        end_idx: int | None = None,
     ) -> Iterable[BlockHash]:
         return islice(
             req.block_hashes,
@@ -354,7 +354,7 @@ class OffloadingConnectorScheduler:
         self,
         request: Request,
         block_ids: list[int],
-    ) -> tuple[bool, Optional[dict[str, Any]]]:
+    ) -> tuple[bool, dict[str, Any] | None]:
         """
         Called when a request has finished, before its blocks are freed.
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
index 0e6693db5cd24..c9fa9efeeb6f8 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import regex as re
 import torch
@@ -304,7 +304,7 @@ class P2pNcclConnector(KVConnectorBase_V1):
 
     def get_finished(
         self, finished_req_ids: set[str], **kwargs: Any
-    ) -> tuple[Optional[set[str]], Optional[set[str]]]:
+    ) -> tuple[set[str] | None, set[str] | None]:
         """
         Notifies worker-side connector ids of requests that have
         finished generating tokens.
@@ -466,7 +466,7 @@ class P2pNcclConnector(KVConnectorBase_V1):
         self,
         request: "Request",
         block_ids: list[int],
-    ) -> tuple[bool, Optional[dict[str, Any]]]:
+    ) -> tuple[bool, dict[str, Any] | None]:
         """
         Called when a request has finished, before its blocks are freed.
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py
index cff68818ca70b..7714359a5091e 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py
@@ -5,11 +5,10 @@ import logging
 import os
 import threading
 import time
-import typing
 from collections import deque
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import Any, Optional
+from typing import Any
 
 import msgpack
 import torch
@@ -77,7 +76,7 @@ class P2pNcclEngine:
         config: KVTransferConfig,
         hostname: str = "",
         port_offset: int = 0,
-        library_path: Optional[str] = None,
+        library_path: str | None = None,
     ) -> None:
         self.config = config
         self.rank = port_offset
@@ -187,7 +186,7 @@ class P2pNcclEngine:
             self.nccl_num_channels,
         )
 
-    def create_connect(self, remote_address: typing.Optional[str] = None):
+    def create_connect(self, remote_address: str | None = None):
         assert remote_address is not None
         if remote_address not in self.socks:
             sock = self.context.socket(zmq.DEALER)
@@ -224,7 +223,7 @@ class P2pNcclEngine:
         self,
         tensor_id: str,
         tensor: torch.Tensor,
-        remote_address: typing.Optional[str] = None,
+        remote_address: str | None = None,
     ) -> bool:
         if remote_address is None:
             with self.recv_store_cv:
@@ -296,7 +295,7 @@ class P2pNcclEngine:
     def recv_tensor(
         self,
         tensor_id: str,
-        remote_address: typing.Optional[str] = None,
+        remote_address: str | None = None,
     ) -> torch.Tensor:
         if self.send_type == "PUT" or self.send_type == "PUT_ASYNC":
             start_time = time.time()
@@ -527,7 +526,7 @@ class P2pNcclEngine:
 
     def get_finished(
         self, finished_req_ids: set[str], no_compile_layers
-    ) -> tuple[Optional[set[str]], Optional[set[str]]]:
+    ) -> tuple[set[str] | None, set[str] | None]:
         """
         Notifies worker-side connector ids of requests that have
         finished generating tokens.
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
index a1bab4e061455..a4beebecbe22d 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
@@ -3,7 +3,7 @@
 import hashlib
 import os
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import safetensors
 import torch
@@ -249,7 +249,7 @@ class SharedStorageConnector(KVConnectorBase_V1):
         self,
         request: "Request",
         num_computed_tokens: int,
-    ) -> tuple[Optional[int], bool]:
+    ) -> tuple[int | None, bool]:
         """
         Get number of new tokens that can be loaded from the
         external KV cache beyond the num_computed_tokens.
diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
index 08b683bfe23f5..f48d03d0b0cd5 100644
--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
+++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py
@@ -14,7 +14,6 @@ These classes above are abstracted behind class `KVCacheBufferBase`.
 """
 
 from abc import ABC, abstractmethod
-from typing import Optional
 
 import torch
 
@@ -98,8 +97,8 @@ class KVLookupBufferBase(KVCacheBufferBase):
 
     @abstractmethod
     def drop_select(
-        self, input_tokens: Optional[torch.Tensor], roi: Optional[torch.Tensor]
-    ) -> list[Optional[torch.Tensor]]:
+        self, input_tokens: torch.Tensor | None, roi: torch.Tensor | None
+    ) -> list[torch.Tensor | None]:
         """Select and *drop* KV cache entries from the lookup buffer.
 
         The functionality is similar to the following python statements
@@ -143,7 +142,7 @@ class KVStoreBufferBase(KVCacheBufferBase):
     def put(
         self,
         key: str,
-        value: Optional[torch.Tensor],
+        value: torch.Tensor | None,
     ) -> None:
         """Store a key-value pair in the buffer.
 
@@ -163,7 +162,7 @@ class KVStoreBufferBase(KVCacheBufferBase):
     def get(
         self,
         key: str,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """Retrieve a value from the buffer by key.
 
         Args:
diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py
index 44fc6d8ac5ad3..7861bea1f9c54 100644
--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py
+++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py
@@ -10,7 +10,6 @@ from this remote lookup buffer.
 import json
 import os
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 from safetensors.torch import load as safetensors_load
@@ -110,7 +109,7 @@ class MooncakeStore(KVStoreBufferBase):
     def put(
         self,
         key: str,
-        value: Optional[torch.Tensor],
+        value: torch.Tensor | None,
     ) -> None:
         # A message queue needs to be introduced before making it asynchronous.
         if value is not None:
@@ -119,7 +118,7 @@ class MooncakeStore(KVStoreBufferBase):
     def get(
         self,
         key: str,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         # A message queue needs to be introduced before making it asynchronous.
         value = self._get_impl(key)
         return value
@@ -142,7 +141,7 @@ class MooncakeStore(KVStoreBufferBase):
     def _get_impl(
         self,
         key: str,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """Get KVCache from Mooncake Store"""
         try:
             data = self.store.get(key)
diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
index cd58ec2e76398..f046a349874e6 100644
--- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
+++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py
@@ -13,7 +13,6 @@ Key Features:
 
 import threading
 from collections import deque
-from typing import Optional, Union
 
 import torch
 
@@ -46,7 +45,7 @@ class SimpleBuffer(KVLookupBufferBase):
         self.buffer_cv = threading.Condition()
         self.signal_pipe = signal_pipe
         self.data_pipe = data_pipe
-        self.request_handling_thread: Optional[threading.Thread] = None
+        self.request_handling_thread: threading.Thread | None = None
 
         self.normal_signal = torch.tensor([0], device="cpu")
         self.end_signal = None
@@ -81,14 +80,14 @@ class SimpleBuffer(KVLookupBufferBase):
 
         return 0
 
-    def _send_tensor_and_dec_size(self, tensor: Optional[torch.Tensor]) -> None:
+    def _send_tensor_and_dec_size(self, tensor: torch.Tensor | None) -> None:
         assert tensor is not None, "Use self.data_pipe.send(None) instead"
         self.buffer_size -= tensor.element_size() * tensor.numel()
         if tensor.dtype == torch.bool:
             tensor = tensor.float()
         self.data_pipe.send_tensor(tensor)
 
-    def _get_element_size(self, data: Optional[Union[list, torch.Tensor]]):
+    def _get_element_size(self, data: list | torch.Tensor | None):
         if isinstance(data, torch.Tensor):
             return data.element_size() * data.numel()
         if not data:
@@ -184,8 +183,8 @@ class SimpleBuffer(KVLookupBufferBase):
         logger.debug("Closing drop_select_handler")
 
     def drop_select(
-        self, input_tokens: Optional[torch.Tensor], roi: Optional[torch.Tensor]
-    ) -> list[Optional[torch.Tensor]]:
+        self, input_tokens: torch.Tensor | None, roi: torch.Tensor | None
+    ) -> list[torch.Tensor | None]:
         assert self.request_handling_thread is None, (
             "drop_select should be called by the KV cache consumer "
             "(e.g. the decode vLLM instance)"
diff --git a/vllm/distributed/kv_transfer/kv_pipe/base.py b/vllm/distributed/kv_transfer/kv_pipe/base.py
index e27c6b2101b84..1fe7a90e9a712 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/base.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/base.py
@@ -12,7 +12,6 @@ you can bypass this interface and directly start from `kv_lookup_buffer`.
 """
 
 from abc import ABC, abstractmethod
-from typing import Optional
 
 import torch
 
@@ -24,7 +23,7 @@ class KVPipeBase(ABC):
     """
 
     @abstractmethod
-    def send_tensor(self, tensor: Optional[torch.Tensor]) -> None:
+    def send_tensor(self, tensor: torch.Tensor | None) -> None:
         """Send a tensor, or None, via the pipe.
 
         Need to support sending None -- important for error handling.
@@ -42,7 +41,7 @@ class KVPipeBase(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def recv_tensor(self) -> Optional[torch.Tensor]:
+    def recv_tensor(self) -> torch.Tensor | None:
         """Receive a tensor (can be None) from the pipeline.
 
         Returns:
diff --git a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
index 65858f86aa235..8203c57e2dc6e 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
@@ -6,7 +6,6 @@ import os
 import struct
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
-from typing import Optional, Union
 
 import torch
 import zmq
@@ -26,7 +25,7 @@ NONE_INT = -150886311
 class MooncakeTransferEngineConfig:
     prefill_url: str
     decode_url: str
-    metadata_backend: Union[str, None]
+    metadata_backend: str | None
     metadata_server: str
     protocol: str
     device_name: str
@@ -143,7 +142,7 @@ class MooncakeTransferEngine:
         metadata_server: str,
         protocol: str,
         device_name: str,
-        metadata_backend: Union[str, None],
+        metadata_backend: str | None,
     ) -> None:
         """Initialize the mooncake instance."""
         if metadata_backend is None:
@@ -231,7 +230,7 @@ class MooncakePipe(KVPipeBase):
     """MooncakeTransferEngine based Pipe implementation."""
 
     def __init__(
-        self, local_rank: int, config: KVTransferConfig, device: Optional[str] = None
+        self, local_rank: int, config: KVTransferConfig, device: str | None = None
     ):
         """Initialize the mooncake pipe and set related parameters."""
         self.config = config
@@ -243,7 +242,7 @@ class MooncakePipe(KVPipeBase):
             self.device = self._select_device(device)
 
         self.transfer_engine = MooncakeTransferEngine(self.kv_rank, self.local_rank)
-        self.transport_thread: Optional[ThreadPoolExecutor] = None
+        self.transport_thread: ThreadPoolExecutor | None = None
         self.none_tensor = torch.tensor([NONE_INT], device=self.device)
 
     def _select_device(self, device: str) -> torch.device:
@@ -267,7 +266,7 @@ class MooncakePipe(KVPipeBase):
         data = self.transfer_engine.recv_bytes()
         return safetensors_load(data)["tensor"].to(self.device)
 
-    def send_tensor(self, tensor: Optional[torch.Tensor]) -> None:
+    def send_tensor(self, tensor: torch.Tensor | None) -> None:
         """Send tensor to the target process."""
         if self.transport_thread is None:
             self.transport_thread = ThreadPoolExecutor(max_workers=1)
@@ -275,7 +274,7 @@ class MooncakePipe(KVPipeBase):
         assert len(tensor.shape) > 0
         self.transport_thread.submit(self._send_impl, tensor)
 
-    def recv_tensor(self) -> Optional[torch.Tensor]:
+    def recv_tensor(self) -> torch.Tensor | None:
         """Receive tensor from other processes."""
         if self.transport_thread is None:
             self.transport_thread = ThreadPoolExecutor(max_workers=1)
diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
index c79b7e7e50303..4682eeee2768d 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
@@ -15,8 +15,8 @@ Key Features:
 
 import threading
 import time
+from collections.abc import Callable
 from concurrent.futures import ThreadPoolExecutor
-from typing import Callable, Optional
 
 import torch
 
@@ -35,7 +35,7 @@ class BrokenPipeException(Exception):
         super().__init__(self.message)
 
 
-Metadata = dict[str, Optional[torch.Tensor]]
+Metadata = dict[str, torch.Tensor | None]
 
 
 class PyNcclPipe(KVPipeBase):
@@ -47,7 +47,7 @@ class PyNcclPipe(KVPipeBase):
         self,
         local_rank: int,
         config: KVTransferConfig,
-        device: Optional[str] = None,
+        device: str | None = None,
         port_offset: int = 0,
     ):
         self.config = config
@@ -77,7 +77,7 @@ class PyNcclPipe(KVPipeBase):
         self.target_rank_for_recv = (self.kv_rank - 1) % self.kv_parallel_size
 
         # transportation-related variables
-        self.transport_thread: Optional[ThreadPoolExecutor] = None
+        self.transport_thread: ThreadPoolExecutor | None = None
         self.buffer_size = 0
         self.buffer_size_lock = threading.Lock()
         self.buffer_size_thresh = self.config.kv_buffer_size
@@ -115,7 +115,7 @@ class PyNcclPipe(KVPipeBase):
         else:
             return torch.device("cpu")
 
-    def _make_metadata(self, tensor: Optional[torch.Tensor]) -> Metadata:
+    def _make_metadata(self, tensor: torch.Tensor | None) -> Metadata:
         """
         Create the metadata as a dictionary based on the input tensor.
 
@@ -167,7 +167,7 @@ class PyNcclPipe(KVPipeBase):
         """
         return self.group.recv_obj(self.target_rank_for_recv)
 
-    def _send_impl(self, tensor: Optional[torch.Tensor]) -> None:
+    def _send_impl(self, tensor: torch.Tensor | None) -> None:
         """
         The actual implementation of sending the tensor and its metadata to the
         target rank.
@@ -181,7 +181,7 @@ class PyNcclPipe(KVPipeBase):
         if tensor is not None:
             self.device_send_func(tensor.to(self.device), self.target_rank_for_send)
 
-    def _recv_impl(self) -> Optional[torch.Tensor]:
+    def _recv_impl(self) -> torch.Tensor | None:
         """
         The actual implementation of receiving a tensor and its metadata from
         the target rank.
@@ -198,7 +198,7 @@ class PyNcclPipe(KVPipeBase):
         return buffer
 
     def send_tensor_wrapper(
-        self, tensor: Optional[torch.Tensor], tensor_size: int
+        self, tensor: torch.Tensor | None, tensor_size: int
     ) -> None:
         """
         Wrapper for _send_impl to handle exceptions and update buffer size.
@@ -228,7 +228,7 @@ class PyNcclPipe(KVPipeBase):
             logger.debug("KV cache transfer pipe is full. Waiting...")
             time.sleep(0.05)
 
-    def send_tensor(self, tensor: Optional[torch.Tensor]) -> None:
+    def send_tensor(self, tensor: torch.Tensor | None) -> None:
         """
         Sends a tensor and its metadata to the destination rank in a
         non-blocking way.
@@ -251,7 +251,7 @@ class PyNcclPipe(KVPipeBase):
 
         self.transport_thread.submit(self.send_tensor_wrapper, tensor, tensor_size)
 
-    def recv_tensor(self) -> Optional[torch.Tensor]:
+    def recv_tensor(self) -> torch.Tensor | None:
         """
         Receives a tensor and its metadata from the source rank. Blocking call.
 
diff --git a/vllm/distributed/kv_transfer/kv_transfer_state.py b/vllm/distributed/kv_transfer/kv_transfer_state.py
index f8f65f28ff6d7..cabfc10e7f942 100644
--- a/vllm/distributed/kv_transfer/kv_transfer_state.py
+++ b/vllm/distributed/kv_transfer/kv_transfer_state.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 from vllm import envs
 from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType
@@ -13,7 +13,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1 import (
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
 
-_KV_CONNECTOR_AGENT: Optional[KVConnectorBaseType] = None
+_KV_CONNECTOR_AGENT: KVConnectorBaseType | None = None
 
 
 def get_kv_transfer_group() -> KVConnectorBaseType:
@@ -27,7 +27,7 @@ def has_kv_transfer_group() -> bool:
     return _KV_CONNECTOR_AGENT is not None
 
 
-def is_v1_kv_transfer_group(connector: Optional[KVConnectorBaseType] = None) -> bool:
+def is_v1_kv_transfer_group(connector: KVConnectorBaseType | None = None) -> bool:
     """Check if the KV connector is the v1 connector.
     If the argument is None, it will check the global KV connector
 
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index cb5a75c59f096..67a8c6f7c053f 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -28,11 +28,12 @@ import gc
 import pickle
 import weakref
 from collections import namedtuple
+from collections.abc import Callable
 from contextlib import contextmanager, nullcontext
 from dataclasses import dataclass
 from datetime import timedelta
 from multiprocessing import shared_memory
-from typing import Any, Callable, Optional, Union
+from typing import Any, Optional
 from unittest.mock import patch
 
 import torch
@@ -65,7 +66,7 @@ TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"])
 
 
 def _split_tensor_dict(
-    tensor_dict: dict[str, Union[torch.Tensor, Any]],
+    tensor_dict: dict[str, torch.Tensor | Any],
 ) -> tuple[list[tuple[str, Any]], list[torch.Tensor]]:
     """Split the tensor dictionary into two parts:
     1. A list of (key, value) pairs. If the value is a tensor, it is replaced
@@ -300,17 +301,17 @@ class GroupCoordinator:
     cpu_group: ProcessGroup  # group for CPU communication
     device_group: ProcessGroup  # group for device communication
     # device communicator (if use_device_communicator=True)
-    device_communicator: Optional[DeviceCommunicatorBase]
-    mq_broadcaster: Optional[Any]  # shared memory broadcaster
+    device_communicator: DeviceCommunicatorBase | None
+    mq_broadcaster: Any | None  # shared memory broadcaster
 
     def __init__(
         self,
         group_ranks: list[list[int]],
         local_rank: int,
-        torch_distributed_backend: Union[str, Backend],
+        torch_distributed_backend: str | Backend,
         use_device_communicator: bool,  # whether to use device communicator
         use_message_queue_broadcaster: bool = False,
-        group_name: Optional[str] = None,
+        group_name: str | None = None,
     ):
         group_name = group_name or "anonymous"
         self.unique_name = _get_unique_name(group_name)
@@ -368,7 +369,7 @@ class GroupCoordinator:
 
         from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
 
-        self.mq_broadcaster: Optional[MessageQueue] = None
+        self.mq_broadcaster: MessageQueue | None = None
         if use_message_queue_broadcaster and self.world_size > 1:
             self.mq_broadcaster = MessageQueue.create_from_process_group(
                 self.cpu_group, 1 << 22, 6
@@ -419,9 +420,7 @@ class GroupCoordinator:
         return self.ranks[(rank_in_group - 1) % world_size]
 
     @contextmanager
-    def graph_capture(
-        self, graph_capture_context: Optional[GraphCaptureContext] = None
-    ):
+    def graph_capture(self, graph_capture_context: GraphCaptureContext | None = None):
         if graph_capture_context is None:
             stream = torch.cuda.Stream()
             graph_capture_context = GraphCaptureContext(stream)
@@ -502,9 +501,9 @@ class GroupCoordinator:
 
     def all_gatherv(
         self,
-        input_: Union[torch.Tensor, list[torch.Tensor]],
+        input_: torch.Tensor | list[torch.Tensor],
         dim: int = 0,
-        sizes: Optional[list[int]] = None,
+        sizes: list[int] | None = None,
     ):
         if self.device_communicator is None:
             raise ValueError("No device communicator found")
@@ -527,7 +526,7 @@ class GroupCoordinator:
             return self._reduce_scatter_out_place(input_, dim)
 
     def reduce_scatterv(
-        self, input_: torch.Tensor, dim: int = -1, sizes: Optional[list[int]] = None
+        self, input_: torch.Tensor, dim: int = -1, sizes: list[int] | None = None
     ) -> torch.Tensor:
         if self.device_communicator is None:
             raise ValueError("No device communicator found")
@@ -540,7 +539,7 @@ class GroupCoordinator:
 
     def gather(
         self, input_: torch.Tensor, dst: int = 0, dim: int = -1
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """
         NOTE: We assume that the input tensor is on the same device across
         all the ranks.
@@ -569,7 +568,7 @@ class GroupCoordinator:
         )
         return input_
 
-    def broadcast_object(self, obj: Optional[Any] = None, src: int = 0):
+    def broadcast_object(self, obj: Any | None = None, src: int = 0):
         """Broadcast the input object.
         NOTE: `src` is the local rank of the source rank.
         """
@@ -594,7 +593,7 @@ class GroupCoordinator:
             return recv[0]
 
     def broadcast_object_list(
-        self, obj_list: list[Any], src: int = 0, group: Optional[ProcessGroup] = None
+        self, obj_list: list[Any], src: int = 0, group: ProcessGroup | None = None
     ):
         """Broadcast the input object list.
         NOTE: `src` is the local rank of the source rank.
@@ -675,11 +674,11 @@ class GroupCoordinator:
 
     def broadcast_tensor_dict(
         self,
-        tensor_dict: Optional[dict[str, Union[torch.Tensor, Any]]] = None,
+        tensor_dict: dict[str, torch.Tensor | Any] | None = None,
         src: int = 0,
-        group: Optional[ProcessGroup] = None,
-        metadata_group: Optional[ProcessGroup] = None,
-    ) -> Optional[dict[str, Union[torch.Tensor, Any]]]:
+        group: ProcessGroup | None = None,
+        metadata_group: ProcessGroup | None = None,
+    ) -> dict[str, torch.Tensor | Any] | None:
         """Broadcast the input tensor dictionary.
         NOTE: `src` is the local rank of the source rank.
         """
@@ -757,11 +756,11 @@ class GroupCoordinator:
 
     def send_tensor_dict(
         self,
-        tensor_dict: dict[str, Union[torch.Tensor, Any]],
-        dst: Optional[int] = None,
+        tensor_dict: dict[str, torch.Tensor | Any],
+        dst: int | None = None,
         all_gather_group: Optional["GroupCoordinator"] = None,
-        all_gather_tensors: Optional[dict[str, bool]] = None,
-    ) -> Optional[dict[str, Union[torch.Tensor, Any]]]:
+        all_gather_tensors: dict[str, bool] | None = None,
+    ) -> dict[str, torch.Tensor | Any] | None:
         """Send the input tensor dictionary.
         NOTE: `dst` is the local rank of the source rank.
 
@@ -845,10 +844,10 @@ class GroupCoordinator:
 
     def recv_tensor_dict(
         self,
-        src: Optional[int] = None,
+        src: int | None = None,
         all_gather_group: Optional["GroupCoordinator"] = None,
-        all_gather_tensors: Optional[dict[str, bool]] = None,
-    ) -> Optional[dict[str, Union[torch.Tensor, Any]]]:
+        all_gather_tensors: dict[str, bool] | None = None,
+    ) -> dict[str, torch.Tensor | Any] | None:
         """Recv the input tensor dictionary.
         NOTE: `src` is the local rank of the source rank.
 
@@ -943,7 +942,7 @@ class GroupCoordinator:
         """
         torch.distributed.barrier(group=self.cpu_group)
 
-    def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None:
+    def send(self, tensor: torch.Tensor, dst: int | None = None) -> None:
         """Sends a tensor to the destination rank in a blocking way"""
         """NOTE: `dst` is the local rank of the destination rank."""
         if self.device_communicator is None:
@@ -951,7 +950,7 @@ class GroupCoordinator:
         self.device_communicator.send(tensor, dst)
 
     def recv(
-        self, size: torch.Size, dtype: torch.dtype, src: Optional[int] = None
+        self, size: torch.Size, dtype: torch.dtype, src: int | None = None
     ) -> torch.Tensor:
         """Receives a tensor from the source rank."""
         """NOTE: `src` is the local rank of the source rank."""
@@ -997,8 +996,8 @@ class GroupCoordinator:
             return hidden_states
 
 
-_WORLD: Optional[GroupCoordinator] = None
-_NODE_COUNT: Optional[int] = None
+_WORLD: GroupCoordinator | None = None
+_NODE_COUNT: int | None = None
 
 
 def get_world_group() -> GroupCoordinator:
@@ -1023,7 +1022,7 @@ def init_model_parallel_group(
     local_rank: int,
     backend: str,
     use_message_queue_broadcaster: bool = False,
-    group_name: Optional[str] = None,
+    group_name: str | None = None,
 ) -> GroupCoordinator:
     return GroupCoordinator(
         group_ranks=group_ranks,
@@ -1035,7 +1034,7 @@ def init_model_parallel_group(
     )
 
 
-_TP: Optional[GroupCoordinator] = None
+_TP: GroupCoordinator | None = None
 
 
 def get_tp_group() -> GroupCoordinator:
@@ -1052,7 +1051,7 @@ def get_tensor_model_parallel_group():
     return get_tp_group()
 
 
-_DCP: Optional[GroupCoordinator] = None
+_DCP: GroupCoordinator | None = None
 
 
 def get_dcp_group() -> GroupCoordinator:
@@ -1063,9 +1062,9 @@ def get_dcp_group() -> GroupCoordinator:
 # kept for backward compatibility
 get_context_model_parallel_group = get_dcp_group
 
-_PP: Optional[GroupCoordinator] = None
+_PP: GroupCoordinator | None = None
 
-_DP: Optional[GroupCoordinator] = None
+_DP: GroupCoordinator | None = None
 
 
 def get_dp_group() -> GroupCoordinator:
@@ -1073,7 +1072,7 @@ def get_dp_group() -> GroupCoordinator:
     return _DP
 
 
-_EP: Optional[GroupCoordinator] = None
+_EP: GroupCoordinator | None = None
 
 
 def get_ep_group() -> GroupCoordinator:
@@ -1131,7 +1130,7 @@ def init_distributed_environment(
     distributed_init_method: str = "env://",
     local_rank: int = -1,
     backend: str = "nccl",
-    timeout: Optional[timedelta] = None,
+    timeout: timedelta | None = None,
 ):
     logger.debug(
         "world_size=%d rank=%d local_rank=%d distributed_init_method=%s backend=%s",
@@ -1208,8 +1207,8 @@ def init_distributed_environment(
 def initialize_model_parallel(
     tensor_model_parallel_size: int = 1,
     pipeline_model_parallel_size: int = 1,
-    decode_context_model_parallel_size: Optional[int] = 1,
-    backend: Optional[str] = None,
+    decode_context_model_parallel_size: int | None = 1,
+    backend: str | None = None,
 ) -> None:
     """
     Initialize model parallel groups.
@@ -1338,8 +1337,8 @@ def initialize_model_parallel(
 def ensure_model_parallel_initialized(
     tensor_model_parallel_size: int,
     pipeline_model_parallel_size: int,
-    decode_context_model_parallel_size: Optional[int] = 1,
-    backend: Optional[str] = None,
+    decode_context_model_parallel_size: int | None = 1,
+    backend: str | None = None,
 ) -> None:
     """Helper to initialize model parallel groups if they are not initialized,
     or ensure tensor-parallel and pipeline-parallel sizes are equal to expected
@@ -1504,7 +1503,7 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
 
 
 def in_the_same_node_as(
-    pg: Union[ProcessGroup, StatelessProcessGroup], source_rank: int = 0
+    pg: ProcessGroup | StatelessProcessGroup, source_rank: int = 0
 ) -> list[bool]:
     """
     This is a collective operation that returns if each rank is in the same node
@@ -1624,7 +1623,7 @@ def is_global_first_rank() -> bool:
         return True
 
 
-def _node_count(pg: Union[ProcessGroup, StatelessProcessGroup]) -> int:
+def _node_count(pg: ProcessGroup | StatelessProcessGroup) -> int:
     """
     Returns the total number of nodes in the process group.
 
diff --git a/vllm/distributed/tpu_distributed_utils.py b/vllm/distributed/tpu_distributed_utils.py
index 3db25d1a19641..4ff1f0ce4410a 100644
--- a/vllm/distributed/tpu_distributed_utils.py
+++ b/vllm/distributed/tpu_distributed_utils.py
@@ -30,9 +30,9 @@ class XlaQKVParallelLinear(nn.Module):
         self.q_weight: Parameter
         self.k_weight: Parameter
         self.v_weight: Parameter
-        self.q_bias: Optional[Parameter]
-        self.k_bias: Optional[Parameter]
-        self.v_bias: Optional[Parameter]
+        self.q_bias: Parameter | None
+        self.k_bias: Parameter | None
+        self.v_bias: Parameter | None
         self._load_weights_from_qkv_linear(qkv_linear)
         if mesh is not None:
             self._shard_weight(mesh)
diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py
index a35f28c25385a..0a1e04ec10f99 100644
--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -15,7 +15,7 @@ import uuid
 from collections import deque
 from collections.abc import Sequence
 from datetime import timedelta
-from typing import Any, Optional
+from typing import Any
 
 import torch
 from torch.distributed import ProcessGroup, TCPStore
@@ -150,7 +150,7 @@ class StatelessProcessGroup:
     store: torch._C._distributed_c10d.Store
 
     # stores a reference to the socket so that the file descriptor stays alive
-    socket: Optional[socket.socket]
+    socket: socket.socket | None
 
     data_expiration_seconds: int = 3600  # 1 hour
 
@@ -197,7 +197,7 @@ class StatelessProcessGroup:
         self.recv_src_counter[src] += 1
         return obj
 
-    def broadcast_obj(self, obj: Optional[Any], src: int) -> Any:
+    def broadcast_obj(self, obj: Any | None, src: int) -> Any:
         """Broadcast an object from a source rank to all other ranks.
         It does not clean up after all ranks have received the object.
         Use it for limited times, e.g., for initialization.
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index cb47e439fc733..54a0539f40479 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -7,15 +7,16 @@ import dataclasses
 import functools
 import json
 import sys
+from collections.abc import Callable
 from dataclasses import MISSING, dataclass, fields, is_dataclass
 from itertools import permutations
+from types import UnionType
 from typing import (
     TYPE_CHECKING,
     Annotated,
     Any,
-    Callable,
     Literal,
-    Optional,
+    TypeAlias,
     TypeVar,
     Union,
     cast,
@@ -98,8 +99,8 @@ logger = init_logger(__name__)
 
 # object is used to allow for special typing forms
 T = TypeVar("T")
-TypeHint = Union[type[Any], object]
-TypeHintT = Union[type[T], object]
+TypeHint: TypeAlias = type[Any] | object
+TypeHintT: TypeAlias = type[T] | object
 
 
 def parse_type(return_type: Callable[[str], T]) -> Callable[[str], T]:
@@ -114,8 +115,8 @@ def parse_type(return_type: Callable[[str], T]) -> Callable[[str], T]:
     return _parse_type
 
 
-def optional_type(return_type: Callable[[str], T]) -> Callable[[str], Optional[T]]:
-    def _optional_type(val: str) -> Optional[T]:
+def optional_type(return_type: Callable[[str], T]) -> Callable[[str], T | None]:
+    def _optional_type(val: str) -> T | None:
         if val == "" or val == "None":
             return None
         return parse_type(return_type)(val)
@@ -123,7 +124,7 @@ def optional_type(return_type: Callable[[str], T]) -> Callable[[str], Optional[T
     return _optional_type
 
 
-def union_dict_and_str(val: str) -> Optional[Union[str, dict[str, str]]]:
+def union_dict_and_str(val: str) -> str | dict[str, str] | None:
     if not re.match(r"(?s)^\s*{.*}\s*$", val):
         return str(val)
     return optional_type(json.loads)(val)
@@ -174,7 +175,8 @@ def get_type_hints(type_hint: TypeHint) -> set[TypeHint]:
 
     if origin is Annotated:
         type_hints.update(get_type_hints(args[0]))
-    elif origin is Union:
+    elif origin in {Union, UnionType}:
+        # Union for Union[X, Y] and UnionType for X | Y
         for arg in args:
             type_hints.update(get_type_hints(arg))
     else:
@@ -195,7 +197,7 @@ NEEDS_HELP = (
 
 
 @functools.lru_cache(maxsize=30)
-def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
+def _compute_kwargs(cls: ConfigType) -> dict[str, dict[str, Any]]:
     # Save time only getting attr docs if we're generating help text
     cls_docs = get_attr_docs(cls) if NEEDS_HELP else {}
     kwargs = {}
@@ -262,7 +264,8 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
             type_hint = get_type(type_hints, list)
             types = get_args(type_hint)
             list_type = types[0]
-            if get_origin(list_type) is Union:
+            if get_origin(list_type) in {Union, UnionType}:
+                # Union for Union[X, Y] and UnionType for X | Y
                 msg = "List type must contain str if it is a Union."
                 assert str in get_args(list_type), msg
                 list_type = str
@@ -310,7 +313,7 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
     return kwargs
 
 
-def get_kwargs(cls: ConfigType) -> dict[str, Any]:
+def get_kwargs(cls: ConfigType) -> dict[str, dict[str, Any]]:
     """Return argparse kwargs for the given Config dataclass.
 
     If `--help` or `mkdocs` are not present in the command line command, the
@@ -328,43 +331,43 @@ class EngineArgs:
     """Arguments for vLLM engine."""
 
     model: str = ModelConfig.model
-    served_model_name: Optional[Union[str, list[str]]] = ModelConfig.served_model_name
-    tokenizer: Optional[str] = ModelConfig.tokenizer
-    hf_config_path: Optional[str] = ModelConfig.hf_config_path
+    served_model_name: str | list[str] | None = ModelConfig.served_model_name
+    tokenizer: str | None = ModelConfig.tokenizer
+    hf_config_path: str | None = ModelConfig.hf_config_path
     runner: RunnerOption = ModelConfig.runner
     convert: ConvertOption = ModelConfig.convert
-    task: Optional[TaskOption] = ModelConfig.task
+    task: TaskOption | None = ModelConfig.task
     skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
     enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
     tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode
     trust_remote_code: bool = ModelConfig.trust_remote_code
     allowed_local_media_path: str = ModelConfig.allowed_local_media_path
-    allowed_media_domains: Optional[list[str]] = ModelConfig.allowed_media_domains
-    download_dir: Optional[str] = LoadConfig.download_dir
+    allowed_media_domains: list[str] | None = ModelConfig.allowed_media_domains
+    download_dir: str | None = LoadConfig.download_dir
     safetensors_load_strategy: str = LoadConfig.safetensors_load_strategy
-    load_format: Union[str, LoadFormats] = LoadConfig.load_format
+    load_format: str | LoadFormats = LoadConfig.load_format
     config_format: str = ModelConfig.config_format
     dtype: ModelDType = ModelConfig.dtype
     kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
-    seed: Optional[int] = ModelConfig.seed
-    max_model_len: Optional[int] = ModelConfig.max_model_len
+    seed: int | None = ModelConfig.seed
+    max_model_len: int | None = ModelConfig.max_model_len
     cuda_graph_sizes: list[int] = get_field(SchedulerConfig, "cuda_graph_sizes")
     # Note: Specifying a custom executor backend by passing a class
     # is intended for expert use only. The API may change without
     # notice.
-    distributed_executor_backend: Optional[
-        Union[str, DistributedExecutorBackend, type[ExecutorBase]]
-    ] = ParallelConfig.distributed_executor_backend
+    distributed_executor_backend: (
+        str | DistributedExecutorBackend | type[ExecutorBase] | None
+    ) = ParallelConfig.distributed_executor_backend
     # number of P/D disaggregation (or other disaggregation) workers
     pipeline_parallel_size: int = ParallelConfig.pipeline_parallel_size
     tensor_parallel_size: int = ParallelConfig.tensor_parallel_size
     decode_context_parallel_size: int = ParallelConfig.decode_context_parallel_size
     data_parallel_size: int = ParallelConfig.data_parallel_size
-    data_parallel_rank: Optional[int] = None
-    data_parallel_start_rank: Optional[int] = None
-    data_parallel_size_local: Optional[int] = None
-    data_parallel_address: Optional[str] = None
-    data_parallel_rpc_port: Optional[int] = None
+    data_parallel_rank: int | None = None
+    data_parallel_start_rank: int | None = None
+    data_parallel_size_local: int | None = None
+    data_parallel_address: str | None = None
+    data_parallel_rpc_port: int | None = None
     data_parallel_hybrid_lb: bool = False
     data_parallel_backend: str = ParallelConfig.data_parallel_backend
     enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
@@ -385,11 +388,11 @@ class EngineArgs:
     eplb_window_size: int = EPLBConfig.window_size
     eplb_step_interval: int = EPLBConfig.step_interval
     eplb_log_balancedness: bool = EPLBConfig.log_balancedness
-    max_parallel_loading_workers: Optional[int] = (
+    max_parallel_loading_workers: int | None = (
         ParallelConfig.max_parallel_loading_workers
     )
-    block_size: Optional[BlockSize] = CacheConfig.block_size
-    enable_prefix_caching: Optional[bool] = CacheConfig.enable_prefix_caching
+    block_size: BlockSize | None = CacheConfig.block_size
+    enable_prefix_caching: bool | None = CacheConfig.enable_prefix_caching
     prefix_caching_hash_algo: PrefixCachingHashAlgo = (
         CacheConfig.prefix_caching_hash_algo
     )
@@ -398,62 +401,62 @@ class EngineArgs:
     swap_space: float = CacheConfig.swap_space
     cpu_offload_gb: float = CacheConfig.cpu_offload_gb
     gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization
-    kv_cache_memory_bytes: Optional[int] = CacheConfig.kv_cache_memory_bytes
-    max_num_batched_tokens: Optional[int] = SchedulerConfig.max_num_batched_tokens
+    kv_cache_memory_bytes: int | None = CacheConfig.kv_cache_memory_bytes
+    max_num_batched_tokens: int | None = SchedulerConfig.max_num_batched_tokens
     max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills
     max_long_partial_prefills: int = SchedulerConfig.max_long_partial_prefills
     long_prefill_token_threshold: int = SchedulerConfig.long_prefill_token_threshold
-    max_num_seqs: Optional[int] = SchedulerConfig.max_num_seqs
+    max_num_seqs: int | None = SchedulerConfig.max_num_seqs
     max_logprobs: int = ModelConfig.max_logprobs
     logprobs_mode: LogprobsMode = ModelConfig.logprobs_mode
     disable_log_stats: bool = False
-    revision: Optional[str] = ModelConfig.revision
-    code_revision: Optional[str] = ModelConfig.code_revision
+    revision: str | None = ModelConfig.revision
+    code_revision: str | None = ModelConfig.code_revision
     rope_scaling: dict[str, Any] = get_field(ModelConfig, "rope_scaling")
-    rope_theta: Optional[float] = ModelConfig.rope_theta
-    hf_token: Optional[Union[bool, str]] = ModelConfig.hf_token
+    rope_theta: float | None = ModelConfig.rope_theta
+    hf_token: bool | str | None = ModelConfig.hf_token
     hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
-    tokenizer_revision: Optional[str] = ModelConfig.tokenizer_revision
-    quantization: Optional[QuantizationMethods] = ModelConfig.quantization
+    tokenizer_revision: str | None = ModelConfig.tokenizer_revision
+    quantization: QuantizationMethods | None = ModelConfig.quantization
     enforce_eager: bool = ModelConfig.enforce_eager
     disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
-    limit_mm_per_prompt: dict[str, Union[int, dict[str, int]]] = get_field(
+    limit_mm_per_prompt: dict[str, int | dict[str, int]] = get_field(
         MultiModalConfig, "limit_per_prompt"
     )
     interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings
     media_io_kwargs: dict[str, dict[str, Any]] = get_field(
         MultiModalConfig, "media_io_kwargs"
     )
-    mm_processor_kwargs: Optional[dict[str, Any]] = MultiModalConfig.mm_processor_kwargs
+    mm_processor_kwargs: dict[str, Any] | None = MultiModalConfig.mm_processor_kwargs
     disable_mm_preprocessor_cache: bool = False  # DEPRECATED
     mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb
-    mm_processor_cache_type: Optional[MMCacheType] = (
+    mm_processor_cache_type: MMCacheType | None = (
         MultiModalConfig.mm_processor_cache_type
     )
     mm_shm_cache_max_object_size_mb: int = (
         MultiModalConfig.mm_shm_cache_max_object_size_mb
     )
     mm_encoder_tp_mode: MMEncoderTPMode = MultiModalConfig.mm_encoder_tp_mode
-    io_processor_plugin: Optional[str] = None
+    io_processor_plugin: str | None = None
     skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
     video_pruning_rate: float = MultiModalConfig.video_pruning_rate
     # LoRA fields
     enable_lora: bool = False
     max_loras: int = LoRAConfig.max_loras
     max_lora_rank: int = LoRAConfig.max_lora_rank
-    default_mm_loras: Optional[dict[str, str]] = LoRAConfig.default_mm_loras
+    default_mm_loras: dict[str, str] | None = LoRAConfig.default_mm_loras
     fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
-    max_cpu_loras: Optional[int] = LoRAConfig.max_cpu_loras
-    lora_dtype: Optional[Union[str, torch.dtype]] = LoRAConfig.lora_dtype
+    max_cpu_loras: int | None = LoRAConfig.max_cpu_loras
+    lora_dtype: str | torch.dtype | None = LoRAConfig.lora_dtype
     lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size
 
     ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
-    num_gpu_blocks_override: Optional[int] = CacheConfig.num_gpu_blocks_override
+    num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override
     num_lookahead_slots: int = SchedulerConfig.num_lookahead_slots
     model_loader_extra_config: dict = get_field(LoadConfig, "model_loader_extra_config")
-    ignore_patterns: Union[str, list[str]] = get_field(LoadConfig, "ignore_patterns")
+    ignore_patterns: str | list[str] = get_field(LoadConfig, "ignore_patterns")
 
-    enable_chunked_prefill: Optional[bool] = SchedulerConfig.enable_chunked_prefill
+    enable_chunked_prefill: bool | None = SchedulerConfig.enable_chunked_prefill
     disable_chunked_mm_input: bool = SchedulerConfig.disable_chunked_mm_input
 
     disable_hybrid_kv_cache_manager: bool = (
@@ -465,35 +468,35 @@ class EngineArgs:
     )
     reasoning_parser: str = StructuredOutputsConfig.reasoning_parser
     # Deprecated guided decoding fields
-    guided_decoding_backend: Optional[str] = None
-    guided_decoding_disable_fallback: Optional[bool] = None
-    guided_decoding_disable_any_whitespace: Optional[bool] = None
-    guided_decoding_disable_additional_properties: Optional[bool] = None
+    guided_decoding_backend: str | None = None
+    guided_decoding_disable_fallback: bool | None = None
+    guided_decoding_disable_any_whitespace: bool | None = None
+    guided_decoding_disable_additional_properties: bool | None = None
 
-    logits_processor_pattern: Optional[str] = ModelConfig.logits_processor_pattern
+    logits_processor_pattern: str | None = ModelConfig.logits_processor_pattern
 
-    speculative_config: Optional[dict[str, Any]] = None
+    speculative_config: dict[str, Any] | None = None
 
-    show_hidden_metrics_for_version: Optional[str] = (
+    show_hidden_metrics_for_version: str | None = (
         ObservabilityConfig.show_hidden_metrics_for_version
     )
-    otlp_traces_endpoint: Optional[str] = ObservabilityConfig.otlp_traces_endpoint
-    collect_detailed_traces: Optional[list[DetailedTraceModules]] = (
+    otlp_traces_endpoint: str | None = ObservabilityConfig.otlp_traces_endpoint
+    collect_detailed_traces: list[DetailedTraceModules] | None = (
         ObservabilityConfig.collect_detailed_traces
     )
     scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
-    scheduler_cls: Union[str, type[object]] = SchedulerConfig.scheduler_cls
+    scheduler_cls: str | type[object] = SchedulerConfig.scheduler_cls
 
-    pooler_config: Optional[PoolerConfig] = ModelConfig.pooler_config
-    override_pooler_config: Optional[Union[dict, PoolerConfig]] = (
+    pooler_config: PoolerConfig | None = ModelConfig.pooler_config
+    override_pooler_config: dict | PoolerConfig | None = (
         ModelConfig.override_pooler_config
     )
     compilation_config: CompilationConfig = get_field(VllmConfig, "compilation_config")
     worker_cls: str = ParallelConfig.worker_cls
     worker_extension_cls: str = ParallelConfig.worker_extension_cls
 
-    kv_transfer_config: Optional[KVTransferConfig] = None
-    kv_events_config: Optional[KVEventsConfig] = None
+    kv_transfer_config: KVTransferConfig | None = None
+    kv_events_config: KVEventsConfig | None = None
 
     generation_config: str = ModelConfig.generation_config
     enable_sleep_mode: bool = ModelConfig.enable_sleep_mode
@@ -515,7 +518,7 @@ class EngineArgs:
     # DEPRECATED
     enable_multimodal_encoder_data_parallel: bool = False
 
-    logits_processors: Optional[list[Union[str, type[LogitsProcessor]]]] = (
+    logits_processors: list[str | type[LogitsProcessor]] | None = (
         ModelConfig.logits_processors
     )
     """Custom logitproc types"""
@@ -1187,7 +1190,7 @@ class EngineArgs:
         target_parallel_config: ParallelConfig,
         enable_chunked_prefill: bool,
         disable_log_stats: bool,
-    ) -> Optional["SpeculativeConfig"]:
+    ) -> SpeculativeConfig | None:
         """Initializes and returns a SpeculativeConfig object based on
         `speculative_config`.
 
@@ -1214,7 +1217,7 @@ class EngineArgs:
 
     def create_engine_config(
         self,
-        usage_context: Optional[UsageContext] = None,
+        usage_context: UsageContext | None = None,
         headless: bool = False,
     ) -> VllmConfig:
         """
@@ -1282,7 +1285,7 @@ class EngineArgs:
             self.enable_chunked_prefill = False
         assert self.enable_chunked_prefill is not None
 
-        sliding_window: Optional[int] = None
+        sliding_window: int | None = None
         if not is_interleaved(model_config.hf_text_config):
             # Only set CacheConfig.sliding_window if the model is all sliding
             # window. Otherwise CacheConfig.sliding_window will override the
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 45b798ed96cb2..64f1961dd849e 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -3,7 +3,7 @@
 
 import time
 from collections import Counter as CollectionsCounter
-from typing import Optional, Union, cast
+from typing import cast
 
 import numpy as np
 import prometheus_client
@@ -304,7 +304,7 @@ class _RayGaugeWrapper:
         self,
         name: str,
         documentation: str = "",
-        labelnames: Optional[list[str]] = None,
+        labelnames: list[str] | None = None,
         multiprocess_mode: str = "",
     ):
         del multiprocess_mode
@@ -317,7 +317,7 @@ class _RayGaugeWrapper:
         self._gauge.set_default_tags(labels)
         return self
 
-    def set(self, value: Union[int, float]):
+    def set(self, value: int | float):
         return self._gauge.set(value)
 
     def set_to_current_time(self):
@@ -330,7 +330,7 @@ class _RayCounterWrapper:
     prometheus_client.Counter"""
 
     def __init__(
-        self, name: str, documentation: str = "", labelnames: Optional[list[str]] = None
+        self, name: str, documentation: str = "", labelnames: list[str] | None = None
     ):
         labelnames_tuple = tuple(labelnames) if labelnames else None
         self._counter = ray_metrics.Counter(
@@ -341,7 +341,7 @@ class _RayCounterWrapper:
         self._counter.set_default_tags(labels)
         return self
 
-    def inc(self, value: Union[int, float] = 1.0):
+    def inc(self, value: int | float = 1.0):
         if value == 0:
             return
         return self._counter.inc(value)
@@ -355,8 +355,8 @@ class _RayHistogramWrapper:
         self,
         name: str,
         documentation: str = "",
-        labelnames: Optional[list[str]] = None,
-        buckets: Optional[list[float]] = None,
+        labelnames: list[str] | None = None,
+        buckets: list[float] | None = None,
     ):
         labelnames_tuple = tuple(labelnames) if labelnames else None
         boundaries = buckets if buckets else []
@@ -371,7 +371,7 @@ class _RayHistogramWrapper:
         self._histogram.set_default_tags(labels)
         return self
 
-    def observe(self, value: Union[int, float]):
+    def observe(self, value: int | float):
         return self._histogram.observe(value)
 
 
@@ -451,8 +451,8 @@ class LoggingStatLogger(StatLoggerBase):
 
     def __init__(self, local_interval: float, vllm_config: VllmConfig) -> None:
         super().__init__(local_interval, vllm_config)
-        self.last_prompt_throughput: Optional[float] = None
-        self.last_generation_throughput: Optional[float] = None
+        self.last_prompt_throughput: float | None = None
+        self.last_generation_throughput: float | None = None
 
     def log(self, stats: Stats) -> None:
         """Called by LLMEngine.
@@ -539,11 +539,11 @@ class PrometheusStatLogger(StatLoggerBase):
             labelnames=list(labels.keys()), vllm_config=vllm_config
         )
 
-    def _log_gauge(self, gauge, data: Union[int, float]) -> None:
+    def _log_gauge(self, gauge, data: int | float) -> None:
         # Convenience function for logging to gauge.
         gauge.labels(**self.labels).set(data)
 
-    def _log_counter(self, counter, data: Union[int, float]) -> None:
+    def _log_counter(self, counter, data: int | float) -> None:
         # Convenience function for logging to counter.
         # Prevent ValueError from negative increment
         if data < 0:
@@ -558,7 +558,7 @@ class PrometheusStatLogger(StatLoggerBase):
         for label, count in data.items():
             counter.labels(**{**self.labels, label_key: label}).inc(count)
 
-    def _log_histogram(self, histogram, data: Union[list[int], list[float]]) -> None:
+    def _log_histogram(self, histogram, data: list[int] | list[float]) -> None:
         # Convenience function for logging list to histogram.
         for datum in data:
             histogram.labels(**self.labels).observe(datum)
diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index e7d957d7b684e..870676346b75b 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -3,7 +3,7 @@
 
 from abc import ABC, abstractmethod
 from collections.abc import AsyncGenerator, Iterable, Mapping
-from typing import Any, Optional, Union
+from typing import Any
 
 from vllm.config import ModelConfig, VllmConfig
 from vllm.inputs.data import PromptType
@@ -28,7 +28,7 @@ class EngineClient(ABC):
     vllm_config: VllmConfig
     model_config: ModelConfig
     processor: Processor
-    io_processor: Optional[IOProcessor]
+    io_processor: IOProcessor | None
 
     @property
     @abstractmethod
@@ -49,16 +49,16 @@ class EngineClient(ABC):
     @abstractmethod
     def generate(
         self,
-        prompt: Union[EngineCoreRequest, PromptType],
+        prompt: EngineCoreRequest | PromptType,
         sampling_params: SamplingParams,
         request_id: str,
         *,
-        prompt_text: Optional[str] = None,
-        lora_request: Optional[LoRARequest] = None,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
+        prompt_text: str | None = None,
+        lora_request: LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
+        trace_headers: Mapping[str, str] | None = None,
         priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
+        data_parallel_rank: int | None = None,
     ) -> AsyncGenerator[RequestOutput, None]:
         """Generate outputs for a request."""
         ...
@@ -69,16 +69,16 @@ class EngineClient(ABC):
         prompt: PromptType,
         pooling_params: PoolingParams,
         request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
+        lora_request: LoRARequest | None = None,
+        trace_headers: Mapping[str, str] | None = None,
         priority: int = 0,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> AsyncGenerator[PoolingRequestOutput, None]:
         """Generate outputs for a request from a pooling model."""
         ...
 
     @abstractmethod
-    async def abort(self, request_id: Union[str, Iterable[str]]) -> None:
+    async def abort(self, request_id: str | Iterable[str]) -> None:
         """Abort a request.
 
         Args:
@@ -119,7 +119,7 @@ class EngineClient(ABC):
         ...
 
     @abstractmethod
-    async def reset_prefix_cache(self, device: Optional[Device] = None) -> None:
+    async def reset_prefix_cache(self, device: Device | None = None) -> None:
         """Reset the prefix cache"""
         ...
 
@@ -129,7 +129,7 @@ class EngineClient(ABC):
         ...
 
     @abstractmethod
-    async def wake_up(self, tags: Optional[list[str]] = None) -> None:
+    async def wake_up(self, tags: list[str] | None = None) -> None:
         """Wake up the engine"""
         ...
 
@@ -152,9 +152,9 @@ class EngineClient(ABC):
     async def collective_rpc(
         self,
         method: str,
-        timeout: Optional[float] = None,
+        timeout: float | None = None,
         args: tuple = (),
-        kwargs: Optional[dict] = None,
+        kwargs: dict | None = None,
     ):
         """Perform a collective RPC call to the given path."""
         raise NotImplementedError
diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py
index c31d15ddac4f5..53dab90f45f77 100644
--- a/vllm/entrypoints/api_server.py
+++ b/vllm/entrypoints/api_server.py
@@ -13,7 +13,7 @@ import json
 import ssl
 from argparse import Namespace
 from collections.abc import AsyncGenerator
-from typing import Any, Optional
+from typing import Any
 
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse, Response, StreamingResponse
@@ -101,7 +101,7 @@ def build_app(args: Namespace) -> FastAPI:
 
 async def init_app(
     args: Namespace,
-    llm_engine: Optional[AsyncLLMEngine] = None,
+    llm_engine: AsyncLLMEngine | None = None,
 ) -> FastAPI:
     app = build_app(args)
 
@@ -120,7 +120,7 @@ async def init_app(
 
 
 async def run_server(
-    args: Namespace, llm_engine: Optional[AsyncLLMEngine] = None, **uvicorn_kwargs: Any
+    args: Namespace, llm_engine: AsyncLLMEngine | None = None, **uvicorn_kwargs: Any
 ) -> None:
     logger.info("vLLM API server version %s", VLLM_VERSION)
     logger.info("args: %s", args)
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index e548554dca734..21973018a2b64 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -5,10 +5,10 @@ import asyncio
 import json
 from abc import ABC, abstractmethod
 from collections import Counter, defaultdict, deque
-from collections.abc import Awaitable, Iterable
+from collections.abc import Awaitable, Callable, Iterable
 from functools import cached_property, lru_cache, partial
 from pathlib import Path
-from typing import Any, Callable, Generic, Literal, Optional, TypeVar, Union, cast
+from typing import Any, Generic, Literal, TypeAlias, TypeVar, cast
 
 import jinja2
 import jinja2.ext
@@ -40,7 +40,7 @@ from pydantic import BaseModel, ConfigDict, TypeAdapter
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, ProcessorMixin
 
 # pydantic needs the TypedDict from typing_extensions
-from typing_extensions import Required, TypeAlias, TypedDict
+from typing_extensions import Required, TypedDict
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
@@ -76,7 +76,7 @@ class ChatCompletionContentPartAudioParam(TypedDict, total=False):
 
 
 class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False):
-    image_embeds: Optional[Union[str, dict[str, str]]]
+    image_embeds: str | dict[str, str] | None
     """
     The image embeddings. It can be either:
     - A single base64 string.
@@ -84,7 +84,7 @@ class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False):
     """
     type: Required[Literal["image_embeds"]]
     """The type of the content part."""
-    uuid: Optional[str]
+    uuid: str | None
     """
     User-provided UUID of a media. User must guarantee that it is properly
     generated and unique for different medias.
@@ -123,8 +123,8 @@ class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
     }
     """
 
-    image_pil: Optional[PILImage]
-    uuid: Optional[str]
+    image_pil: PILImage | None
+    uuid: str | None
     """
     User-provided UUID of a media. User must guarantee that it is properly
     generated and unique for different medias.
@@ -141,8 +141,8 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
     }
     """
 
-    image_url: Optional[str]
-    uuid: Optional[str]
+    image_url: str | None
+    uuid: str | None
     """
     User-provided UUID of a media. User must guarantee that it is properly
     generated and unique for different medias.
@@ -158,7 +158,7 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False):
     }
     """
 
-    audio_url: Optional[str]
+    audio_url: str | None
 
 
 class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
@@ -170,8 +170,8 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
     }
     """
 
-    video_url: Optional[str]
-    uuid: Optional[str]
+    video_url: str | None
+    uuid: str | None
     """
     User-provided UUID of a media. User must guarantee that it is properly
     generated and unique for different medias.
@@ -199,20 +199,20 @@ class CustomThinkCompletionContentParam(TypedDict, total=False):
     """The thinking type."""
 
 
-ChatCompletionContentPartParam: TypeAlias = Union[
-    OpenAIChatCompletionContentPartParam,
-    ChatCompletionContentPartAudioParam,
-    ChatCompletionContentPartInputAudioParam,
-    ChatCompletionContentPartVideoParam,
-    ChatCompletionContentPartRefusalParam,
-    CustomChatCompletionContentPILImageParam,
-    CustomChatCompletionContentSimpleImageParam,
-    ChatCompletionContentPartImageEmbedsParam,
-    CustomChatCompletionContentSimpleAudioParam,
-    CustomChatCompletionContentSimpleVideoParam,
-    str,
-    CustomThinkCompletionContentParam,
-]
+ChatCompletionContentPartParam: TypeAlias = (
+    OpenAIChatCompletionContentPartParam
+    | ChatCompletionContentPartAudioParam
+    | ChatCompletionContentPartInputAudioParam
+    | ChatCompletionContentPartVideoParam
+    | ChatCompletionContentPartRefusalParam
+    | CustomChatCompletionContentPILImageParam
+    | CustomChatCompletionContentSimpleImageParam
+    | ChatCompletionContentPartImageEmbedsParam
+    | CustomChatCompletionContentSimpleAudioParam
+    | CustomChatCompletionContentSimpleVideoParam
+    | str
+    | CustomThinkCompletionContentParam
+)
 
 
 class CustomChatCompletionMessageParam(TypedDict, total=False):
@@ -221,7 +221,7 @@ class CustomChatCompletionMessageParam(TypedDict, total=False):
     role: Required[str]
     """The role of the message's author."""
 
-    content: Union[str, list[ChatCompletionContentPartParam]]
+    content: str | list[ChatCompletionContentPartParam]
     """The contents of the message."""
 
     name: str
@@ -231,18 +231,18 @@ class CustomChatCompletionMessageParam(TypedDict, total=False):
     same role.
     """
 
-    tool_call_id: Optional[str]
+    tool_call_id: str | None
     """Tool call that this message is responding to."""
 
-    tool_calls: Optional[Iterable[ChatCompletionMessageToolCallParam]]
+    tool_calls: Iterable[ChatCompletionMessageToolCallParam] | None
     """The tool calls generated by the model, such as function calls."""
 
 
-ChatCompletionMessageParam = Union[
-    OpenAIChatCompletionMessageParam,
-    CustomChatCompletionMessageParam,
-    OpenAIHarmonyMessage,
-]
+ChatCompletionMessageParam: TypeAlias = (
+    OpenAIChatCompletionMessageParam
+    | CustomChatCompletionMessageParam
+    | OpenAIHarmonyMessage
+)
 
 
 # TODO: Make fields ReadOnly once mypy supports it
@@ -250,16 +250,16 @@ class ConversationMessage(TypedDict, total=False):
     role: Required[str]
     """The role of the message's author."""
 
-    content: Union[Optional[str], list[dict[str, str]]]
+    content: str | None | list[dict[str, str]]
     """The contents of the message"""
 
-    tool_call_id: Optional[str]
+    tool_call_id: str | None
     """Tool call that this message is responding to."""
 
-    name: Optional[str]
+    name: str | None
     """The name of the function to call"""
 
-    tool_calls: Optional[Iterable[ChatCompletionMessageToolCallParam]]
+    tool_calls: Iterable[ChatCompletionMessageToolCallParam] | None
     """The tool calls generated by the model, such as function calls."""
 
 
@@ -294,7 +294,7 @@ def _is_attr_access(node: jinja2.nodes.Node, varname: str, key: str) -> bool:
 def _is_var_or_elems_access(
     node: jinja2.nodes.Node,
     varname: str,
-    key: Optional[str] = None,
+    key: str | None = None,
 ) -> bool:
     if isinstance(node, jinja2.nodes.Filter):
         return node.node is not None and _is_var_or_elems_access(
@@ -369,7 +369,7 @@ def _iter_nodes_assign_content_item(root: jinja2.nodes.Node):
                 break
 
 
-def _try_extract_ast(chat_template: str) -> Optional[jinja2.nodes.Template]:
+def _try_extract_ast(chat_template: str) -> jinja2.nodes.Template | None:
     try:
         jinja_compiled = hf_chat_utils._compile_jinja_template(chat_template)
         return jinja_compiled.environment.parse(chat_template)
@@ -400,9 +400,9 @@ def _detect_content_format(
 
 
 def resolve_mistral_chat_template(
-    chat_template: Optional[str],
+    chat_template: str | None,
     **kwargs: Any,
-) -> Optional[str]:
+) -> str | None:
     if chat_template is not None or kwargs.get("chat_template_kwargs") is not None:
         raise ValueError(
             "'chat_template' or 'chat_template_kwargs' cannot be overridden "
@@ -412,7 +412,7 @@ def resolve_mistral_chat_template(
     return None
 
 
-_PROCESSOR_CHAT_TEMPLATES = dict[tuple[str, bool], Optional[str]]()
+_PROCESSOR_CHAT_TEMPLATES = dict[tuple[str, bool], str | None]()
 """
 Used in `_try_get_processor_chat_template` to avoid calling
 `cached_get_processor` again if the processor fails to be loaded.
@@ -422,9 +422,9 @@ This is needed because `lru_cache` does not cache when an exception happens.
 
 
 def _try_get_processor_chat_template(
-    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+    tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
     model_config: ModelConfig,
-) -> Optional[str]:
+) -> str | None:
     cache_key = (tokenizer.name_or_path, model_config.trust_remote_code)
     if cache_key in _PROCESSOR_CHAT_TEMPLATES:
         return _PROCESSOR_CHAT_TEMPLATES[cache_key]
@@ -458,12 +458,12 @@ def _try_get_processor_chat_template(
 
 
 def resolve_hf_chat_template(
-    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
-    chat_template: Optional[str],
-    tools: Optional[list[dict[str, Any]]],
+    tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
+    chat_template: str | None,
+    tools: list[dict[str, Any]] | None,
     *,
     model_config: ModelConfig,
-) -> Optional[str]:
+) -> str | None:
     # 1st priority: The given chat template
     if chat_template is not None:
         return chat_template
@@ -505,8 +505,8 @@ def resolve_hf_chat_template(
 
 
 def _resolve_chat_template_content_format(
-    chat_template: Optional[str],
-    tools: Optional[list[dict[str, Any]]],
+    chat_template: str | None,
+    tools: list[dict[str, Any]] | None,
     tokenizer: AnyTokenizer,
     *,
     model_config: ModelConfig,
@@ -538,7 +538,7 @@ def _resolve_chat_template_content_format(
 
 @lru_cache
 def _log_chat_template_content_format(
-    chat_template: Optional[str],
+    chat_template: str | None,
     given_format: ChatTemplateContentFormatOption,
     detected_format: ChatTemplateContentFormatOption,
 ):
@@ -561,8 +561,8 @@ def _log_chat_template_content_format(
 
 
 def resolve_chat_template_content_format(
-    chat_template: Optional[str],
-    tools: Optional[list[dict[str, Any]]],
+    chat_template: str | None,
+    tools: list[dict[str, Any]] | None,
     given_format: ChatTemplateContentFormatOption,
     tokenizer: AnyTokenizer,
     *,
@@ -604,8 +604,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
         self._model_config = model_config
         self._tokenizer = tokenizer
 
-        self._items_by_modality = defaultdict[str, list[Optional[_T]]](list)
-        self._uuids_by_modality = defaultdict[str, list[Optional[str]]](list)
+        self._items_by_modality = defaultdict[str, list[_T | None]](list)
+        self._uuids_by_modality = defaultdict[str, list[str | None]](list)
 
     @property
     def model_config(self) -> ModelConfig:
@@ -637,9 +637,9 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
     def add(
         self,
         modality: ModalityStr,
-        item: Optional[_T],
-        uuid: Optional[str] = None,
-    ) -> Optional[str]:
+        item: _T | None,
+        uuid: str | None = None,
+    ) -> str | None:
         """
         Add a multi-modal item to the current prompt and returns the
         placeholder string to use, if any.
@@ -657,7 +657,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
 
         return self.model_cls.get_placeholder_str(modality, num_items)
 
-    def all_mm_uuids(self) -> Optional[MultiModalUUIDDict]:
+    def all_mm_uuids(self) -> MultiModalUUIDDict | None:
         if not self._items_by_modality:
             return None
         mm_uuids = {}
@@ -684,7 +684,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
 
 
 class MultiModalItemTracker(BaseMultiModalItemTracker[object]):
-    def all_mm_data(self) -> Optional[MultiModalDataDict]:
+    def all_mm_data(self) -> MultiModalDataDict | None:
         if not self._items_by_modality:
             return None
         mm_inputs = {}
@@ -710,7 +710,7 @@ class MultiModalItemTracker(BaseMultiModalItemTracker[object]):
 
 
 class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]):
-    async def all_mm_data(self) -> Optional[MultiModalDataDict]:
+    async def all_mm_data(self) -> MultiModalDataDict | None:
         if not self._items_by_modality:
             return None
         mm_inputs = {}
@@ -756,7 +756,7 @@ class BaseMultiModalContentParser(ABC):
         # }
         self._placeholder_storage: dict[str, list] = defaultdict(list)
 
-    def _add_placeholder(self, modality: ModalityStr, placeholder: Optional[str]):
+    def _add_placeholder(self, modality: ModalityStr, placeholder: str | None):
         mod_placeholder = MODALITY_PLACEHOLDERS_MAP[modality]
         if placeholder:
             self._placeholder_storage[mod_placeholder].append(placeholder)
@@ -765,35 +765,35 @@ class BaseMultiModalContentParser(ABC):
         return dict(self._placeholder_storage)
 
     @abstractmethod
-    def parse_image(self, image_url: Optional[str], uuid: Optional[str] = None) -> None:
+    def parse_image(self, image_url: str | None, uuid: str | None = None) -> None:
         raise NotImplementedError
 
     @abstractmethod
     def parse_image_embeds(
         self,
-        image_embeds: Union[str, dict[str, str], None],
-        uuid: Optional[str] = None,
+        image_embeds: str | dict[str, str] | None,
+        uuid: str | None = None,
     ) -> None:
         raise NotImplementedError
 
     @abstractmethod
     def parse_image_pil(
-        self, image_pil: Optional[Image.Image], uuid: Optional[str] = None
+        self, image_pil: Image.Image | None, uuid: str | None = None
     ) -> None:
         raise NotImplementedError
 
     @abstractmethod
-    def parse_audio(self, audio_url: Optional[str], uuid: Optional[str] = None) -> None:
+    def parse_audio(self, audio_url: str | None, uuid: str | None = None) -> None:
         raise NotImplementedError
 
     @abstractmethod
     def parse_input_audio(
-        self, input_audio: Optional[InputAudio], uuid: Optional[str] = None
+        self, input_audio: InputAudio | None, uuid: str | None = None
     ) -> None:
         raise NotImplementedError
 
     @abstractmethod
-    def parse_video(self, video_url: Optional[str], uuid: Optional[str] = None) -> None:
+    def parse_video(self, video_url: str | None, uuid: str | None = None) -> None:
         raise NotImplementedError
 
 
@@ -810,7 +810,7 @@ class MultiModalContentParser(BaseMultiModalContentParser):
             allowed_media_domains=tracker.allowed_media_domains,
         )
 
-    def parse_image(self, image_url: Optional[str], uuid: Optional[str] = None) -> None:
+    def parse_image(self, image_url: str | None, uuid: str | None = None) -> None:
         image = self._connector.fetch_image(image_url) if image_url else None
 
         placeholder = self._tracker.add("image", image, uuid)
@@ -818,8 +818,8 @@ class MultiModalContentParser(BaseMultiModalContentParser):
 
     def parse_image_embeds(
         self,
-        image_embeds: Union[str, dict[str, str], None],
-        uuid: Optional[str] = None,
+        image_embeds: str | dict[str, str] | None,
+        uuid: str | None = None,
     ) -> None:
         if isinstance(image_embeds, dict):
             embeds = {
@@ -838,19 +838,19 @@ class MultiModalContentParser(BaseMultiModalContentParser):
         self._add_placeholder("image", placeholder)
 
     def parse_image_pil(
-        self, image_pil: Optional[Image.Image], uuid: Optional[str] = None
+        self, image_pil: Image.Image | None, uuid: str | None = None
     ) -> None:
         placeholder = self._tracker.add("image", image_pil, uuid)
         self._add_placeholder("image", placeholder)
 
-    def parse_audio(self, audio_url: Optional[str], uuid: Optional[str] = None) -> None:
+    def parse_audio(self, audio_url: str | None, uuid: str | None = None) -> None:
         audio = self._connector.fetch_audio(audio_url) if audio_url else None
 
         placeholder = self._tracker.add("audio", audio, uuid)
         self._add_placeholder("audio", placeholder)
 
     def parse_input_audio(
-        self, input_audio: Optional[InputAudio], uuid: Optional[str] = None
+        self, input_audio: InputAudio | None, uuid: str | None = None
     ) -> None:
         if input_audio:
             audio_data = input_audio.get("data", "")
@@ -865,7 +865,7 @@ class MultiModalContentParser(BaseMultiModalContentParser):
 
         return self.parse_audio(audio_url, uuid)
 
-    def parse_video(self, video_url: Optional[str], uuid: Optional[str] = None) -> None:
+    def parse_video(self, video_url: str | None, uuid: str | None = None) -> None:
         video = self._connector.fetch_video(video_url=video_url) if video_url else None
 
         placeholder = self._tracker.add("video", video, uuid)
@@ -885,7 +885,7 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
             allowed_media_domains=tracker.allowed_media_domains,
         )
 
-    def parse_image(self, image_url: Optional[str], uuid: Optional[str] = None) -> None:
+    def parse_image(self, image_url: str | None, uuid: str | None = None) -> None:
         image_coro = self._connector.fetch_image_async(image_url) if image_url else None
 
         placeholder = self._tracker.add("image", image_coro, uuid)
@@ -893,10 +893,10 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
 
     def parse_image_embeds(
         self,
-        image_embeds: Union[str, dict[str, str], None],
-        uuid: Optional[str] = None,
+        image_embeds: str | dict[str, str] | None,
+        uuid: str | None = None,
     ) -> None:
-        future: asyncio.Future[Union[str, dict[str, str], None]] = asyncio.Future()
+        future: asyncio.Future[str | dict[str, str] | None] = asyncio.Future()
 
         if isinstance(image_embeds, dict):
             embeds = {
@@ -916,9 +916,9 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
         self._add_placeholder("image", placeholder)
 
     def parse_image_pil(
-        self, image_pil: Optional[Image.Image], uuid: Optional[str] = None
+        self, image_pil: Image.Image | None, uuid: str | None = None
     ) -> None:
-        future: asyncio.Future[Optional[Image.Image]] = asyncio.Future()
+        future: asyncio.Future[Image.Image | None] = asyncio.Future()
         if image_pil:
             future.set_result(image_pil)
         else:
@@ -927,14 +927,14 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
         placeholder = self._tracker.add("image", future, uuid)
         self._add_placeholder("image", placeholder)
 
-    def parse_audio(self, audio_url: Optional[str], uuid: Optional[str] = None) -> None:
+    def parse_audio(self, audio_url: str | None, uuid: str | None = None) -> None:
         audio_coro = self._connector.fetch_audio_async(audio_url) if audio_url else None
 
         placeholder = self._tracker.add("audio", audio_coro, uuid)
         self._add_placeholder("audio", placeholder)
 
     def parse_input_audio(
-        self, input_audio: Optional[InputAudio], uuid: Optional[str] = None
+        self, input_audio: InputAudio | None, uuid: str | None = None
     ) -> None:
         if input_audio:
             audio_data = input_audio.get("data", "")
@@ -949,7 +949,7 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
 
         return self.parse_audio(audio_url, uuid)
 
-    def parse_video(self, video_url: Optional[str], uuid: Optional[str] = None) -> None:
+    def parse_video(self, video_url: str | None, uuid: str | None = None) -> None:
         video = (
             self._connector.fetch_video_async(video_url=video_url)
             if video_url
@@ -960,7 +960,7 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
         self._add_placeholder("video", placeholder)
 
 
-def validate_chat_template(chat_template: Optional[Union[Path, str]]):
+def validate_chat_template(chat_template: Path | str | None):
     """Raises if the provided chat template appears invalid."""
     if chat_template is None:
         return
@@ -984,10 +984,10 @@ def validate_chat_template(chat_template: Optional[Union[Path, str]]):
 
 
 def _load_chat_template(
-    chat_template: Optional[Union[Path, str]],
+    chat_template: Path | str | None,
     *,
     is_literal: bool = False,
-) -> Optional[str]:
+) -> str | None:
     if chat_template is None:
         return None
 
@@ -1024,10 +1024,10 @@ _cached_load_chat_template = lru_cache(_load_chat_template)
 
 
 def load_chat_template(
-    chat_template: Optional[Union[Path, str]],
+    chat_template: Path | str | None,
     *,
     is_literal: bool = False,
-) -> Optional[str]:
+) -> str | None:
     return _cached_load_chat_template(chat_template, is_literal=is_literal)
 
 
@@ -1107,7 +1107,7 @@ _AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam).validate_python
 _VideoParser = TypeAdapter(ChatCompletionContentPartVideoParam).validate_python
 
 _ResponsesInputImageParser = TypeAdapter(ResponseInputImageParam).validate_python
-_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio, PILImage]
+_ContentPart: TypeAlias = str | dict[str, str] | InputAudio | PILImage
 
 # Define a mapping from part types to their corresponding parsing functions.
 MM_PARSER_MAP: dict[
@@ -1264,7 +1264,7 @@ def _parse_chat_message_content_part(
     *,
     wrap_dicts: bool,
     interleave_strings: bool,
-) -> Optional[_ContentPart]:
+) -> _ContentPart | None:
     """Parses a single part of a conversation. If wrap_dicts is True,
     structured dictionary pieces for texts and images will be
     wrapped in dictionaries, i.e., {"type": "text", "text", ...} and
@@ -1310,10 +1310,7 @@ def _parse_chat_message_content_part(
         mm_parser.parse_image(str_content, uuid)
         modality = "image"
     elif part_type == "image_embeds":
-        if content is not None:
-            content = cast(Union[str, dict[str, str]], content)
-        else:
-            content = None
+        content = cast(str | dict[str, str], content) if content is not None else None
         mm_parser.parse_image_embeds(content, uuid)
         modality = "image"
     elif part_type == "audio_url":
@@ -1411,8 +1408,8 @@ def parse_chat_messages(
     content_format: _ChatTemplateContentFormat,
 ) -> tuple[
     list[ConversationMessage],
-    Optional[MultiModalDataDict],
-    Optional[MultiModalUUIDDict],
+    MultiModalDataDict | None,
+    MultiModalUUIDDict | None,
 ]:
     conversation: list[ConversationMessage] = []
     mm_tracker = MultiModalItemTracker(model_config, tokenizer)
@@ -1443,8 +1440,8 @@ def parse_chat_messages_futures(
     content_format: _ChatTemplateContentFormat,
 ) -> tuple[
     list[ConversationMessage],
-    Awaitable[Optional[MultiModalDataDict]],
-    Optional[MultiModalUUIDDict],
+    Awaitable[MultiModalDataDict | None],
+    MultiModalUUIDDict | None,
 ]:
     conversation: list[ConversationMessage] = []
     mm_tracker = AsyncMultiModalItemTracker(model_config, tokenizer)
@@ -1498,7 +1495,7 @@ _cached_resolve_chat_template_kwargs = lru_cache(_resolve_chat_template_kwargs)
 
 
 def resolve_chat_template_kwargs(
-    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+    tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
     chat_template: str,
     chat_template_kwargs: dict[str, Any],
 ) -> dict[str, Any]:
@@ -1518,10 +1515,10 @@ def resolve_chat_template_kwargs(
 
 
 def apply_hf_chat_template(
-    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+    tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
     conversation: list[ConversationMessage],
-    chat_template: Optional[str],
-    tools: Optional[list[dict[str, Any]]],
+    chat_template: str | None,
+    tools: list[dict[str, Any]] | None,
     *,
     model_config: ModelConfig,
     tokenize: bool = False,  # Different from HF's default
@@ -1569,8 +1566,8 @@ def apply_hf_chat_template(
 def apply_mistral_chat_template(
     tokenizer: MistralTokenizer,
     messages: list[ChatCompletionMessageParam],
-    chat_template: Optional[str],
-    tools: Optional[list[dict[str, Any]]],
+    chat_template: str | None,
+    tools: list[dict[str, Any]] | None,
     **kwargs: Any,
 ) -> list[int]:
     from mistral_common.exceptions import MistralCommonException
diff --git a/vllm/entrypoints/cli/benchmark/main.py b/vllm/entrypoints/cli/benchmark/main.py
index d7455daa1a6b7..7a1d247760095 100644
--- a/vllm/entrypoints/cli/benchmark/main.py
+++ b/vllm/entrypoints/cli/benchmark/main.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import argparse
 import typing
 
@@ -12,6 +10,8 @@ from vllm.entrypoints.utils import VLLM_SUBCMD_PARSER_EPILOG
 
 if typing.TYPE_CHECKING:
     from vllm.utils import FlexibleArgumentParser
+else:
+    FlexibleArgumentParser = argparse.ArgumentParser
 
 
 class BenchmarkSubcommand(CLISubcommand):
diff --git a/vllm/entrypoints/cli/collect_env.py b/vllm/entrypoints/cli/collect_env.py
index e79a7efec6bac..e47dce0a401a2 100644
--- a/vllm/entrypoints/cli/collect_env.py
+++ b/vllm/entrypoints/cli/collect_env.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import argparse
 import typing
 
@@ -11,6 +9,8 @@ from vllm.entrypoints.cli.types import CLISubcommand
 
 if typing.TYPE_CHECKING:
     from vllm.utils import FlexibleArgumentParser
+else:
+    FlexibleArgumentParser = argparse.ArgumentParser
 
 
 class CollectEnvSubcommand(CLISubcommand):
diff --git a/vllm/entrypoints/cli/main.py b/vllm/entrypoints/cli/main.py
index cb15952f0d2de..213a466036222 100644
--- a/vllm/entrypoints/cli/main.py
+++ b/vllm/entrypoints/cli/main.py
@@ -5,8 +5,6 @@
 Note that all future modules must be lazily loaded within main
 to avoid certain eager import breakage."""
 
-from __future__ import annotations
-
 import importlib.metadata
 import sys
 
diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py
index 5372210bbf55c..a27c6fe6618a1 100644
--- a/vllm/entrypoints/cli/openai.py
+++ b/vllm/entrypoints/cli/openai.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import argparse
 import os
 import signal
@@ -16,6 +14,8 @@ from vllm.entrypoints.cli.types import CLISubcommand
 
 if TYPE_CHECKING:
     from vllm.utils import FlexibleArgumentParser
+else:
+    FlexibleArgumentParser = argparse.ArgumentParser
 
 
 def _register_signal_handlers():
diff --git a/vllm/entrypoints/cli/run_batch.py b/vllm/entrypoints/cli/run_batch.py
index 6e7a15ada49cf..4b18ceb5215fa 100644
--- a/vllm/entrypoints/cli/run_batch.py
+++ b/vllm/entrypoints/cli/run_batch.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import argparse
 import asyncio
 import importlib.metadata
@@ -14,6 +12,8 @@ from vllm.logger import init_logger
 
 if typing.TYPE_CHECKING:
     from vllm.utils import FlexibleArgumentParser
+else:
+    FlexibleArgumentParser = argparse.ArgumentParser
 
 logger = init_logger(__name__)
 
diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py
index b3960b74cf019..350add801038d 100644
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -3,7 +3,6 @@
 
 import argparse
 import signal
-from typing import Optional
 
 import uvloop
 
@@ -179,7 +178,7 @@ def run_multi_api_server(args: argparse.Namespace):
     hybrid_dp_lb = parallel_config.data_parallel_hybrid_lb
     assert external_dp_lb or hybrid_dp_lb or dp_rank == 0
 
-    api_server_manager: Optional[APIServerProcessManager] = None
+    api_server_manager: APIServerProcessManager | None = None
 
     with launch_core_engines(
         vllm_config, executor_class, log_stats, num_api_servers
diff --git a/vllm/entrypoints/cli/types.py b/vllm/entrypoints/cli/types.py
index 6194f421a1bb4..f4eeb5b3c2e19 100644
--- a/vllm/entrypoints/cli/types.py
+++ b/vllm/entrypoints/cli/types.py
@@ -1,13 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import argparse
 import typing
 
 if typing.TYPE_CHECKING:
     from vllm.utils import FlexibleArgumentParser
+else:
+    FlexibleArgumentParser = argparse.ArgumentParser
 
 
 class CLISubcommand:
diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index f410ee9c40456..c694bcfaaa756 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -6,7 +6,7 @@ import json
 import logging
 from abc import ABC, abstractmethod
 from contextlib import AsyncExitStack
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING, Union
 
 from openai.types.responses.tool import Mcp
 from openai_harmony import Author, Message, Role, StreamState, TextContent
@@ -82,7 +82,7 @@ class ConversationContext(ABC):
     @abstractmethod
     async def init_tool_sessions(
         self,
-        tool_server: Optional[ToolServer],
+        tool_server: ToolServer | None,
         exit_stack: AsyncExitStack,
         request_id: str,
         mcp_tools: dict[str, Mcp],
@@ -122,7 +122,7 @@ class SimpleContext(ConversationContext):
 
     async def init_tool_sessions(
         self,
-        tool_server: Optional[ToolServer],
+        tool_server: ToolServer | None,
         exit_stack: AsyncExitStack,
         request_id: str,
         mcp_tools: dict[str, Mcp],
@@ -140,9 +140,9 @@ class HarmonyContext(ConversationContext):
         available_tools: list[str],
     ):
         self._messages = messages
-        self.finish_reason: Optional[str] = None
+        self.finish_reason: str | None = None
         self.available_tools = available_tools
-        self._tool_sessions: dict[str, Union[ClientSession, Tool]] = {}
+        self._tool_sessions: dict[str, ClientSession | Tool] = {}
         self.called_tools: set[str] = set()
 
         self.parser = get_streamable_parser_for_assistant()
@@ -164,7 +164,7 @@ class HarmonyContext(ConversationContext):
         if self.parser.current_channel in {"analysis", "commentary"}:
             self.num_reasoning_tokens += 1
 
-    def append_output(self, output: Union[RequestOutput, list[Message]]) -> None:
+    def append_output(self, output: RequestOutput | list[Message]) -> None:
         if isinstance(output, RequestOutput):
             output_token_ids = output.outputs[0].token_ids
             self.parser = get_streamable_parser_for_assistant()
@@ -358,7 +358,7 @@ class HarmonyContext(ConversationContext):
 
     async def init_tool_sessions(
         self,
-        tool_server: Optional[ToolServer],
+        tool_server: ToolServer | None,
         exit_stack: AsyncExitStack,
         request_id: str,
         mcp_tools: dict[str, Mcp],
@@ -446,7 +446,7 @@ class StreamingHarmonyContext(HarmonyContext):
     def messages(self) -> list:
         return self._messages
 
-    def append_output(self, output: Union[RequestOutput, list[Message]]) -> None:
+    def append_output(self, output: RequestOutput | list[Message]) -> None:
         if isinstance(output, RequestOutput):
             # append_output is called for each output token in streaming case,
             # so we only want to add the prompt tokens once for each message.
diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py
index 53a08b1a4485c..fe581e5484e1f 100644
--- a/vllm/entrypoints/harmony_utils.py
+++ b/vllm/entrypoints/harmony_utils.py
@@ -1,12 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import datetime
 import json
 from collections.abc import Iterable, Sequence
-from typing import Literal, Union
+from typing import Literal
 
 from openai.types.responses import (
     ResponseFunctionToolCall,
@@ -122,7 +120,7 @@ def get_system_message(
     return sys_msg
 
 
-def create_tool_definition(tool: Union[ChatCompletionToolsParam, Tool]):
+def create_tool_definition(tool: ChatCompletionToolsParam | Tool):
     if isinstance(tool, ChatCompletionToolsParam):
         return ToolDescription.new(
             name=tool.function.name,
@@ -138,13 +136,13 @@ def create_tool_definition(tool: Union[ChatCompletionToolsParam, Tool]):
 
 def get_developer_message(
     instructions: str | None = None,
-    tools: list[Union[Tool, ChatCompletionToolsParam]] | None = None,
+    tools: list[Tool | ChatCompletionToolsParam] | None = None,
 ) -> Message:
     dev_msg_content = DeveloperContent.new()
     if instructions is not None and not envs.VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS:
         dev_msg_content = dev_msg_content.with_instructions(instructions)
     if tools is not None:
-        function_tools: list[Union[Tool, ChatCompletionToolsParam]] = []
+        function_tools: list[Tool | ChatCompletionToolsParam] = []
         for tool in tools:
             if tool.type in (
                 "web_search_preview",
@@ -178,7 +176,7 @@ def get_user_message(content: str) -> Message:
 
 def parse_response_input(
     response_msg: ResponseInputOutputItem,
-    prev_responses: list[Union[ResponseOutputItem, ResponseReasoningItem]],
+    prev_responses: list[ResponseOutputItem | ResponseReasoningItem],
 ) -> Message:
     if not isinstance(response_msg, dict):
         response_msg = response_msg.model_dump()
diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py
index 349437363c5b8..49bb86291f8b6 100644
--- a/vllm/entrypoints/launcher.py
+++ b/vllm/entrypoints/launcher.py
@@ -5,7 +5,7 @@ import asyncio
 import signal
 import socket
 from http import HTTPStatus
-from typing import Any, Optional
+from typing import Any
 
 import uvicorn
 from fastapi import FastAPI, Request, Response
@@ -26,7 +26,7 @@ logger = init_logger(__name__)
 
 async def serve_http(
     app: FastAPI,
-    sock: Optional[socket.socket],
+    sock: socket.socket | None,
     enable_ssl_refresh: bool = False,
     **uvicorn_kwargs: Any,
 ):
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 8f47c20f27e0a..668344fdcc34c 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -2,8 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
-from collections.abc import Sequence
-from typing import TYPE_CHECKING, Any, Callable, Optional, Union, cast
+from collections.abc import Callable, Sequence
+from typing import TYPE_CHECKING, Any, cast
 
 import cloudpickle
 import torch.nn as nn
@@ -191,36 +191,34 @@ class LLM:
         *,
         runner: RunnerOption = "auto",
         convert: ConvertOption = "auto",
-        tokenizer: Optional[str] = None,
+        tokenizer: str | None = None,
         tokenizer_mode: TokenizerMode = "auto",
         skip_tokenizer_init: bool = False,
         trust_remote_code: bool = False,
         allowed_local_media_path: str = "",
-        allowed_media_domains: Optional[list[str]] = None,
+        allowed_media_domains: list[str] | None = None,
         tensor_parallel_size: int = 1,
         dtype: ModelDType = "auto",
-        quantization: Optional[QuantizationMethods] = None,
-        revision: Optional[str] = None,
-        tokenizer_revision: Optional[str] = None,
-        seed: Optional[int] = None,
+        quantization: QuantizationMethods | None = None,
+        revision: str | None = None,
+        tokenizer_revision: str | None = None,
+        seed: int | None = None,
         gpu_memory_utilization: float = 0.9,
         swap_space: float = 4,
         cpu_offload_gb: float = 0,
         enforce_eager: bool = False,
         disable_custom_all_reduce: bool = False,
-        hf_token: Optional[Union[bool, str]] = None,
-        hf_overrides: Optional[HfOverrides] = None,
-        mm_processor_kwargs: Optional[dict[str, Any]] = None,
-        pooler_config: Optional[PoolerConfig] = None,
-        override_pooler_config: Optional[PoolerConfig] = None,
-        structured_outputs_config: Optional[
-            Union[dict[str, Any], StructuredOutputsConfig]
-        ] = None,
-        kv_cache_memory_bytes: Optional[int] = None,
-        compilation_config: Optional[
-            Union[int, dict[str, Any], CompilationConfig]
-        ] = None,
-        logits_processors: Optional[list[Union[str, type[LogitsProcessor]]]] = None,
+        hf_token: bool | str | None = None,
+        hf_overrides: HfOverrides | None = None,
+        mm_processor_kwargs: dict[str, Any] | None = None,
+        pooler_config: PoolerConfig | None = None,
+        override_pooler_config: PoolerConfig | None = None,
+        structured_outputs_config: dict[str, Any]
+        | StructuredOutputsConfig
+        | None = None,
+        kv_cache_memory_bytes: int | None = None,
+        compilation_config: int | dict[str, Any] | CompilationConfig | None = None,
+        logits_processors: list[str | type[LogitsProcessor]] | None = None,
         **kwargs: Any,
     ) -> None:
         """LLM constructor."""
@@ -331,7 +329,7 @@ class LLM:
         self.engine_class = type(self.llm_engine)
 
         self.request_counter = Counter()
-        self.default_sampling_params: Union[dict[str, Any], None] = None
+        self.default_sampling_params: dict[str, Any] | None = None
 
         supported_tasks = self.llm_engine.get_supported_tasks()
         logger.info("Supported tasks: %s", supported_tasks)
@@ -367,14 +365,12 @@ class LLM:
 
     def generate(
         self,
-        prompts: Union[PromptType, Sequence[PromptType]],
-        sampling_params: Optional[
-            Union[SamplingParams, Sequence[SamplingParams]]
-        ] = None,
+        prompts: PromptType | Sequence[PromptType],
+        sampling_params: SamplingParams | Sequence[SamplingParams] | None = None,
         *,
-        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
-        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
-        priority: Optional[list[int]] = None,
+        use_tqdm: bool | Callable[..., tqdm] = True,
+        lora_request: list[LoRARequest] | LoRARequest | None = None,
+        priority: list[int] | None = None,
     ) -> list[RequestOutput]:
         """Generates the completions for the input prompts.
 
@@ -437,8 +433,8 @@ class LLM:
 
     def _get_modality_specific_lora_reqs(
         self,
-        prompts: Union[PromptType, Sequence[PromptType]],
-        lora_request: Optional[Union[list[LoRARequest], LoRARequest]],
+        prompts: PromptType | Sequence[PromptType],
+        lora_request: list[LoRARequest] | LoRARequest | None,
     ):
         # Grab the lora config off the vllm config on the engine,
         # since this is the same for both v0 & v1.
@@ -474,8 +470,8 @@ class LLM:
     def _resolve_single_prompt_mm_lora(
         self,
         prompt: PromptType,
-        lora_request: Optional[LoRARequest],
-        default_mm_loras: Optional[dict[str, str]],
+        lora_request: LoRARequest | None,
+        default_mm_loras: dict[str, str] | None,
     ):
         if (
             not default_mm_loras
@@ -526,10 +522,10 @@ class LLM:
 
     def collective_rpc(
         self,
-        method: Union[str, Callable[..., _R]],
-        timeout: Optional[float] = None,
+        method: str | Callable[..., _R],
+        timeout: float | None = None,
         args: tuple = (),
-        kwargs: Optional[dict[str, Any]] = None,
+        kwargs: dict[str, Any] | None = None,
     ) -> list[_R]:
         """
         Execute an RPC call on all workers.
@@ -571,9 +567,9 @@ class LLM:
 
     def _get_beam_search_lora_requests(
         self,
-        lora_request: Optional[Union[list[LoRARequest], LoRARequest]],
-        prompts: list[Union[TokensPrompt, TextPrompt]],
-    ) -> list[Optional[LoRARequest]]:
+        lora_request: list[LoRARequest] | LoRARequest | None,
+        prompts: list[TokensPrompt | TextPrompt],
+    ) -> list[LoRARequest | None]:
         """Get the optional lora request corresponding to each prompt."""
         if isinstance(lora_request, Sequence) and len(lora_request) != len(prompts):
             raise ValueError(
@@ -587,11 +583,11 @@ class LLM:
 
     def beam_search(
         self,
-        prompts: list[Union[TokensPrompt, TextPrompt]],
+        prompts: list[TokensPrompt | TextPrompt],
         params: BeamSearchParams,
-        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
+        lora_request: list[LoRARequest] | LoRARequest | None = None,
         use_tqdm: bool = False,
-        concurrency_limit: Optional[int] = None,
+        concurrency_limit: int | None = None,
     ) -> list[BeamSearchOutput]:
         """
         Generate sequences using beam search.
@@ -770,16 +766,15 @@ class LLM:
 
     def preprocess_chat(
         self,
-        messages: Union[
-            list[ChatCompletionMessageParam], list[list[ChatCompletionMessageParam]]
-        ],
-        chat_template: Optional[str] = None,
+        messages: list[ChatCompletionMessageParam]
+        | list[list[ChatCompletionMessageParam]],
+        chat_template: str | None = None,
         chat_template_content_format: ChatTemplateContentFormatOption = "auto",
         add_generation_prompt: bool = True,
         continue_final_message: bool = False,
-        tools: Optional[list[dict[str, Any]]] = None,
-        chat_template_kwargs: Optional[dict[str, Any]] = None,
-        mm_processor_kwargs: Optional[dict[str, Any]] = None,
+        tools: list[dict[str, Any]] | None = None,
+        chat_template_kwargs: dict[str, Any] | None = None,
+        mm_processor_kwargs: dict[str, Any] | None = None,
     ) -> list[TokensPrompt]:
         """
         Generate prompt for a chat conversation. The pre-processed
@@ -868,19 +863,18 @@ class LLM:
 
     def chat(
         self,
-        messages: Union[
-            list[ChatCompletionMessageParam], list[list[ChatCompletionMessageParam]]
-        ],
-        sampling_params: Optional[Union[SamplingParams, list[SamplingParams]]] = None,
-        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
-        lora_request: Optional[LoRARequest] = None,
-        chat_template: Optional[str] = None,
+        messages: list[ChatCompletionMessageParam]
+        | list[list[ChatCompletionMessageParam]],
+        sampling_params: SamplingParams | list[SamplingParams] | None = None,
+        use_tqdm: bool | Callable[..., tqdm] = True,
+        lora_request: LoRARequest | None = None,
+        chat_template: str | None = None,
         chat_template_content_format: ChatTemplateContentFormatOption = "auto",
         add_generation_prompt: bool = True,
         continue_final_message: bool = False,
-        tools: Optional[list[dict[str, Any]]] = None,
-        chat_template_kwargs: Optional[dict[str, Any]] = None,
-        mm_processor_kwargs: Optional[dict[str, Any]] = None,
+        tools: list[dict[str, Any]] | None = None,
+        chat_template_kwargs: dict[str, Any] | None = None,
+        mm_processor_kwargs: dict[str, Any] | None = None,
     ) -> list[RequestOutput]:
         """
         Generate responses for a chat conversation.
@@ -953,14 +947,14 @@ class LLM:
 
     def encode(
         self,
-        prompts: Union[PromptType, Sequence[PromptType], DataPrompt],
-        pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None,
+        prompts: PromptType | Sequence[PromptType] | DataPrompt,
+        pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
         *,
-        truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
-        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
+        truncate_prompt_tokens: int | None = None,
+        use_tqdm: bool | Callable[..., tqdm] = True,
+        lora_request: list[LoRARequest] | LoRARequest | None = None,
         pooling_task: PoolingTask = "encode",
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[PoolingRequestOutput]:
         """Apply pooling to the hidden states corresponding to the input
         prompts.
@@ -1086,12 +1080,12 @@ class LLM:
 
     def embed(
         self,
-        prompts: Union[PromptType, Sequence[PromptType]],
+        prompts: PromptType | Sequence[PromptType],
         *,
-        truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
-        pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None,
-        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
+        truncate_prompt_tokens: int | None = None,
+        use_tqdm: bool | Callable[..., tqdm] = True,
+        pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
+        lora_request: list[LoRARequest] | LoRARequest | None = None,
     ) -> list[EmbeddingRequestOutput]:
         """
         Generate an embedding vector for each prompt.
@@ -1135,11 +1129,11 @@ class LLM:
 
     def classify(
         self,
-        prompts: Union[PromptType, Sequence[PromptType]],
+        prompts: PromptType | Sequence[PromptType],
         *,
-        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
-        pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None,
-        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
+        use_tqdm: bool | Callable[..., tqdm] = True,
+        pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
+        lora_request: list[LoRARequest] | LoRARequest | None = None,
     ) -> list[ClassificationRequestOutput]:
         """
         Generate class logits for each prompt.
@@ -1181,13 +1175,13 @@ class LLM:
 
     def reward(
         self,
-        prompts: Union[PromptType, Sequence[PromptType]],
+        prompts: PromptType | Sequence[PromptType],
         /,
         *,
-        truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
-        pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None,
-        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
+        truncate_prompt_tokens: int | None = None,
+        use_tqdm: bool | Callable[..., tqdm] = True,
+        pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
+        lora_request: list[LoRARequest] | LoRARequest | None = None,
     ) -> list[PoolingRequestOutput]:
         """
         Generate rewards for each prompt.
@@ -1220,12 +1214,12 @@ class LLM:
     def _embedding_score(
         self,
         tokenizer: AnyTokenizer,
-        text_1: list[Union[str, TextPrompt, TokensPrompt]],
-        text_2: list[Union[str, TextPrompt, TokensPrompt]],
-        truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
-        pooling_params: Optional[PoolingParams] = None,
-        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
+        text_1: list[str | TextPrompt | TokensPrompt],
+        text_2: list[str | TextPrompt | TokensPrompt],
+        truncate_prompt_tokens: int | None = None,
+        use_tqdm: bool | Callable[..., tqdm] = True,
+        pooling_params: PoolingParams | None = None,
+        lora_request: list[LoRARequest] | LoRARequest | None = None,
     ) -> list[ScoringRequestOutput]:
         encoded_output: list[PoolingRequestOutput] = self.encode(
             text_1 + text_2,
@@ -1252,12 +1246,12 @@ class LLM:
     def _cross_encoding_score(
         self,
         tokenizer: AnyTokenizer,
-        data_1: Union[list[str], list[ScoreContentPartParam]],
-        data_2: Union[list[str], list[ScoreContentPartParam]],
-        truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
-        pooling_params: Optional[PoolingParams] = None,
-        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
+        data_1: list[str] | list[ScoreContentPartParam],
+        data_2: list[str] | list[ScoreContentPartParam],
+        truncate_prompt_tokens: int | None = None,
+        use_tqdm: bool | Callable[..., tqdm] = True,
+        pooling_params: PoolingParams | None = None,
+        lora_request: list[LoRARequest] | LoRARequest | None = None,
     ) -> list[ScoringRequestOutput]:
         model_config = self.model_config
 
@@ -1316,14 +1310,14 @@ class LLM:
 
     def score(
         self,
-        data_1: Union[SingletonPrompt, Sequence[SingletonPrompt], ScoreMultiModalParam],
-        data_2: Union[SingletonPrompt, Sequence[SingletonPrompt], ScoreMultiModalParam],
+        data_1: SingletonPrompt | Sequence[SingletonPrompt] | ScoreMultiModalParam,
+        data_2: SingletonPrompt | Sequence[SingletonPrompt] | ScoreMultiModalParam,
         /,
         *,
-        truncate_prompt_tokens: Optional[int] = None,
-        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
-        pooling_params: Optional[PoolingParams] = None,
-        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
+        truncate_prompt_tokens: int | None = None,
+        use_tqdm: bool | Callable[..., tqdm] = True,
+        pooling_params: PoolingParams | None = None,
+        lora_request: list[LoRARequest] | LoRARequest | None = None,
     ) -> list[ScoringRequestOutput]:
         """Generate similarity scores for all pairs `<text,text_pair>` or
           `<multi-modal data, multi-modal data pair>`.
@@ -1391,9 +1385,9 @@ class LLM:
         if not model_config.is_multimodal_model:
 
             def check_data_type(
-                data: Union[
-                    SingletonPrompt, Sequence[SingletonPrompt], ScoreMultiModalParam
-                ],
+                data: SingletonPrompt
+                | Sequence[SingletonPrompt]
+                | ScoreMultiModalParam,
             ):
                 if isinstance(data, dict) and "content" in data:
                     raise ValueError(
@@ -1470,7 +1464,7 @@ class LLM:
     def stop_profile(self) -> None:
         self.llm_engine.stop_profile()
 
-    def reset_prefix_cache(self, device: Optional[Device] = None) -> bool:
+    def reset_prefix_cache(self, device: Device | None = None) -> bool:
         return self.llm_engine.reset_prefix_cache(device)
 
     def sleep(self, level: int = 1):
@@ -1495,7 +1489,7 @@ class LLM:
         self.reset_prefix_cache()
         self.llm_engine.sleep(level=level)
 
-    def wake_up(self, tags: Optional[list[str]] = None):
+    def wake_up(self, tags: list[str] | None = None):
         """
         Wake up the engine from sleep mode. See the [sleep][vllm.LLM.sleep]
         method for more details.
@@ -1523,17 +1517,15 @@ class LLM:
 
     def _validate_and_add_requests(
         self,
-        prompts: Union[PromptType, Sequence[PromptType], DataPrompt],
-        params: Union[
-            SamplingParams,
-            Sequence[SamplingParams],
-            PoolingParams,
-            Sequence[PoolingParams],
-        ],
+        prompts: PromptType | Sequence[PromptType] | DataPrompt,
+        params: SamplingParams
+        | Sequence[SamplingParams]
+        | PoolingParams
+        | Sequence[PoolingParams],
         *,
-        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
-        lora_request: Optional[Union[Sequence[LoRARequest], LoRARequest]],
-        priority: Optional[list[int]] = None,
+        use_tqdm: bool | Callable[..., tqdm] = True,
+        lora_request: Sequence[LoRARequest] | LoRARequest | None,
+        priority: list[int] | None = None,
     ) -> None:
         if isinstance(prompts, (str, dict)):
             # Convert a single prompt to a list.
@@ -1575,8 +1567,8 @@ class LLM:
 
     def _validate_mm_data_and_uuids(
         self,
-        multi_modal_data: Optional[Any],  # MultiModalDataDict
-        multi_modal_uuids: Optional[Any],  # MultiModalUUIDDict
+        multi_modal_data: Any | None,  # MultiModalDataDict
+        multi_modal_uuids: Any | None,  # MultiModalUUIDDict
     ):
         """
         Validate that if any multi-modal data is skipped (i.e. None),
@@ -1625,9 +1617,9 @@ class LLM:
         self,
         request_id: str,
         engine_prompt: PromptType,
-        params: Union[SamplingParams, PoolingParams],
+        params: SamplingParams | PoolingParams,
         *,
-        lora_request: Optional[LoRARequest],
+        lora_request: LoRARequest | None,
         priority: int,
     ) -> tuple[EngineCoreRequest, dict[str, Any]]:
         """Use the Processor to process inputs for LLMEngine."""
@@ -1651,8 +1643,8 @@ class LLM:
     def _add_request(
         self,
         prompt: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        lora_request: Optional[LoRARequest] = None,
+        params: SamplingParams | PoolingParams,
+        lora_request: LoRARequest | None = None,
         priority: int = 0,
     ) -> None:
         prompt_text, _, _ = get_prompt_components(prompt)
@@ -1677,8 +1669,8 @@ class LLM:
         )
 
     def _run_engine(
-        self, *, use_tqdm: Union[bool, Callable[..., tqdm]] = True
-    ) -> list[Union[RequestOutput, PoolingRequestOutput]]:
+        self, *, use_tqdm: bool | Callable[..., tqdm] = True
+    ) -> list[RequestOutput | PoolingRequestOutput]:
         # Initialize tqdm.
         if use_tqdm:
             num_requests = self.llm_engine.get_num_unfinished_requests()
@@ -1691,7 +1683,7 @@ class LLM:
             )
 
         # Run the engine.
-        outputs: list[Union[RequestOutput, PoolingRequestOutput]] = []
+        outputs: list[RequestOutput | PoolingRequestOutput] = []
         total_in_toks = 0
         total_out_toks = 0
         while self.llm_engine.has_unfinished_requests():
diff --git a/vllm/entrypoints/logger.py b/vllm/entrypoints/logger.py
index 96a84668e92b3..c43d5ddf82fbd 100644
--- a/vllm/entrypoints/logger.py
+++ b/vllm/entrypoints/logger.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
-from typing import Optional, Union
 
 import torch
 
@@ -15,17 +14,17 @@ logger = init_logger(__name__)
 
 
 class RequestLogger:
-    def __init__(self, *, max_log_len: Optional[int]) -> None:
+    def __init__(self, *, max_log_len: int | None) -> None:
         self.max_log_len = max_log_len
 
     def log_inputs(
         self,
         request_id: str,
-        prompt: Optional[str],
-        prompt_token_ids: Optional[list[int]],
-        prompt_embeds: Optional[torch.Tensor],
-        params: Optional[Union[SamplingParams, PoolingParams, BeamSearchParams]],
-        lora_request: Optional[LoRARequest],
+        prompt: str | None,
+        prompt_token_ids: list[int] | None,
+        prompt_embeds: torch.Tensor | None,
+        params: SamplingParams | PoolingParams | BeamSearchParams | None,
+        lora_request: LoRARequest | None,
     ) -> None:
         max_log_len = self.max_log_len
         if max_log_len is not None:
@@ -52,8 +51,8 @@ class RequestLogger:
         self,
         request_id: str,
         outputs: str,
-        output_token_ids: Optional[Sequence[int]],
-        finish_reason: Optional[str] = None,
+        output_token_ids: Sequence[int] | None,
+        finish_reason: str | None = None,
         is_streaming: bool = False,
         delta: bool = False,
     ) -> None:
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 5d5baad00da16..96a0947c4bd31 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -16,10 +16,10 @@ import socket
 import tempfile
 import uuid
 from argparse import Namespace
-from collections.abc import AsyncGenerator, AsyncIterator, Awaitable
+from collections.abc import AsyncGenerator, AsyncIterator, Awaitable, Callable
 from contextlib import asynccontextmanager
 from http import HTTPStatus
-from typing import Annotated, Any, Callable, Literal, Optional
+from typing import Annotated, Any, Literal
 
 import prometheus_client
 import pydantic
@@ -166,8 +166,8 @@ async def build_async_engine_client(
     args: Namespace,
     *,
     usage_context: UsageContext = UsageContext.OPENAI_API_SERVER,
-    disable_frontend_multiprocessing: Optional[bool] = None,
-    client_config: Optional[dict[str, Any]] = None,
+    disable_frontend_multiprocessing: bool | None = None,
+    client_config: dict[str, Any] | None = None,
 ) -> AsyncIterator[EngineClient]:
     if os.getenv("VLLM_WORKER_MULTIPROC_METHOD") == "forkserver":
         # The executor is expected to be mp.
@@ -203,7 +203,7 @@ async def build_async_engine_client_from_engine_args(
     *,
     usage_context: UsageContext = UsageContext.OPENAI_API_SERVER,
     disable_frontend_multiprocessing: bool = False,
-    client_config: Optional[dict[str, Any]] = None,
+    client_config: dict[str, Any] | None = None,
 ) -> AsyncIterator[EngineClient]:
     """
     Create EngineClient, either:
@@ -227,7 +227,7 @@ async def build_async_engine_client_from_engine_args(
 
     from vllm.v1.engine.async_llm import AsyncLLM
 
-    async_llm: Optional[AsyncLLM] = None
+    async_llm: AsyncLLM | None = None
 
     # Don't mutate the input client_config
     client_config = dict(client_config) if client_config else {}
@@ -308,35 +308,35 @@ def models(request: Request) -> OpenAIServingModels:
     return request.app.state.openai_serving_models
 
 
-def responses(request: Request) -> Optional[OpenAIServingResponses]:
+def responses(request: Request) -> OpenAIServingResponses | None:
     return request.app.state.openai_serving_responses
 
 
-def chat(request: Request) -> Optional[OpenAIServingChat]:
+def chat(request: Request) -> OpenAIServingChat | None:
     return request.app.state.openai_serving_chat
 
 
-def completion(request: Request) -> Optional[OpenAIServingCompletion]:
+def completion(request: Request) -> OpenAIServingCompletion | None:
     return request.app.state.openai_serving_completion
 
 
-def pooling(request: Request) -> Optional[OpenAIServingPooling]:
+def pooling(request: Request) -> OpenAIServingPooling | None:
     return request.app.state.openai_serving_pooling
 
 
-def embedding(request: Request) -> Optional[OpenAIServingEmbedding]:
+def embedding(request: Request) -> OpenAIServingEmbedding | None:
     return request.app.state.openai_serving_embedding
 
 
-def score(request: Request) -> Optional[ServingScores]:
+def score(request: Request) -> ServingScores | None:
     return request.app.state.openai_serving_scores
 
 
-def classify(request: Request) -> Optional[ServingClassification]:
+def classify(request: Request) -> ServingClassification | None:
     return request.app.state.openai_serving_classification
 
 
-def rerank(request: Request) -> Optional[ServingScores]:
+def rerank(request: Request) -> ServingScores | None:
     return request.app.state.openai_serving_scores
 
 
@@ -542,8 +542,8 @@ async def create_responses(request: ResponsesRequest, raw_request: Request):
 async def retrieve_responses(
     response_id: str,
     raw_request: Request,
-    starting_after: Optional[int] = None,
-    stream: Optional[bool] = False,
+    starting_after: int | None = None,
+    stream: bool | None = False,
 ):
     handler = responses(raw_request)
     if handler is None:
@@ -1039,7 +1039,7 @@ if envs.VLLM_SERVER_DEV_MODE:
         # User-defined `method` is responsible for deserialization if needed.
         args: list[str] = body.get("args", [])
         kwargs: dict[str, str] = body.get("kwargs", {})
-        timeout: Optional[float] = body.get("timeout")
+        timeout: float | None = body.get("timeout")
         results = await engine_client(raw_request).collective_rpc(
             method=method, timeout=timeout, args=tuple(args), kwargs=kwargs
         )
@@ -1120,7 +1120,7 @@ async def is_scaling_elastic_ep(raw_request: Request):
 # TODO: RequestType = TypeForm[BaseModel] when recognized by type checkers
 # (requires typing_extensions >= 4.13)
 RequestType = Any
-GetHandlerFn = Callable[[Request], Optional[OpenAIServing]]
+GetHandlerFn = Callable[[Request], OpenAIServing | None]
 EndpointFn = Callable[[RequestType, Request], Awaitable[Any]]
 
 # NOTE: Items defined earlier take higher priority
@@ -1236,7 +1236,7 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING:
         return Response(status_code=200, content=response)
 
 
-def load_log_config(log_config_file: Optional[str]) -> Optional[dict]:
+def load_log_config(log_config_file: str | None) -> dict | None:
     if not log_config_file:
         return None
     try:
@@ -1655,7 +1655,7 @@ async def init_app_state(
                 )
 
     if args.tool_server == "demo":
-        tool_server: Optional[ToolServer] = DemoToolServer()
+        tool_server: ToolServer | None = DemoToolServer()
         assert isinstance(tool_server, DemoToolServer)
         await tool_server.init_and_validate()
     elif args.tool_server:
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index 1f16646db63b8..99d6cbaa86b8f 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -11,7 +11,7 @@ import json
 import ssl
 from collections.abc import Sequence
 from dataclasses import field
-from typing import Literal, Optional, Union
+from typing import Literal
 
 from pydantic.dataclasses import dataclass
 
@@ -39,8 +39,8 @@ class LoRAParserAction(argparse.Action):
         self,
         parser: argparse.ArgumentParser,
         namespace: argparse.Namespace,
-        values: Optional[Union[str, Sequence[str]]],
-        option_string: Optional[str] = None,
+        values: str | Sequence[str] | None,
+        option_string: str | None = None,
     ):
         if values is None:
             values = []
@@ -73,11 +73,11 @@ class LoRAParserAction(argparse.Action):
 class FrontendArgs:
     """Arguments for the OpenAI-compatible frontend server."""
 
-    host: Optional[str] = None
+    host: str | None = None
     """Host name."""
     port: int = 8000
     """Port number."""
-    uds: Optional[str] = None
+    uds: str | None = None
     """Unix domain socket path. If set, host and port arguments are ignored."""
     uvicorn_log_level: Literal[
         "debug", "info", "warning", "error", "critical", "trace"
@@ -93,15 +93,15 @@ class FrontendArgs:
     """Allowed methods."""
     allowed_headers: list[str] = field(default_factory=lambda: ["*"])
     """Allowed headers."""
-    api_key: Optional[list[str]] = None
+    api_key: list[str] | None = None
     """If provided, the server will require one of these keys to be presented in
     the header."""
-    lora_modules: Optional[list[LoRAModulePath]] = None
+    lora_modules: list[LoRAModulePath] | None = None
     """LoRA modules configurations in either 'name=path' format or JSON format
     or JSON list format. Example (old format): `'name=path'` Example (new
     format): `{\"name\": \"name\", \"path\": \"lora_path\",
     \"base_model_name\": \"id\"}`"""
-    chat_template: Optional[str] = None
+    chat_template: str | None = None
     """The file path to the chat template, or the template in single-line form
     for the specified model."""
     chat_template_content_format: ChatTemplateContentFormatOption = "auto"
@@ -116,17 +116,17 @@ class FrontendArgs:
     or the ones from tokenizer."""
     response_role: str = "assistant"
     """The role name to return if `request.add_generation_prompt=true`."""
-    ssl_keyfile: Optional[str] = None
+    ssl_keyfile: str | None = None
     """The file path to the SSL key file."""
-    ssl_certfile: Optional[str] = None
+    ssl_certfile: str | None = None
     """The file path to the SSL cert file."""
-    ssl_ca_certs: Optional[str] = None
+    ssl_ca_certs: str | None = None
     """The CA certificates file."""
     enable_ssl_refresh: bool = False
     """Refresh SSL Context when SSL certificate files change"""
     ssl_cert_reqs: int = int(ssl.CERT_NONE)
     """Whether client certificate is required (see stdlib ssl module's)."""
-    root_path: Optional[str] = None
+    root_path: str | None = None
     """FastAPI root_path when app is behind a path based routing proxy."""
     middleware: list[str] = field(default_factory=lambda: [])
     """Additional ASGI middleware to apply to the app. We accept multiple
@@ -149,7 +149,7 @@ class FrontendArgs:
     exclude_tools_when_tool_choice_none: bool = False
     """If specified, exclude tool definitions in prompts when
     tool_choice='none'."""
-    tool_call_parser: Optional[str] = None
+    tool_call_parser: str | None = None
     """Select the tool call parser depending on the model that you're using.
     This is used to parse the model-generated tool call into OpenAI API format.
     Required for `--enable-auto-tool-choice`. You can choose any option from
@@ -158,13 +158,13 @@ class FrontendArgs:
     """Special the tool parser plugin write to parse the model-generated tool
     into OpenAI API format, the name register in this plugin can be used in
     `--tool-call-parser`."""
-    tool_server: Optional[str] = None
+    tool_server: str | None = None
     """Comma-separated list of host:port pairs (IPv4, IPv6, or hostname).
     Examples: 127.0.0.1:8000, [::1]:8000, localhost:1234. Or `demo` for demo
     purpose."""
-    log_config_file: Optional[str] = envs.VLLM_LOGGING_CONFIG_PATH
+    log_config_file: str | None = envs.VLLM_LOGGING_CONFIG_PATH
     """Path to logging config JSON file for both vllm and uvicorn"""
-    max_log_len: Optional[int] = None
+    max_log_len: int | None = None
     """Max number of prompt characters or prompt ID numbers being printed in
     log. The default of None means unlimited."""
     disable_fastapi_docs: bool = False
diff --git a/vllm/entrypoints/openai/logits_processors.py b/vllm/entrypoints/openai/logits_processors.py
index 2ea9fbf386ba1..dedbc23ec83fa 100644
--- a/vllm/entrypoints/openai/logits_processors.py
+++ b/vllm/entrypoints/openai/logits_processors.py
@@ -3,7 +3,6 @@
 
 from collections.abc import Iterable
 from functools import lru_cache, partial
-from typing import Optional, Union
 
 import torch
 
@@ -16,8 +15,8 @@ class AllowedTokenIdsLogitsProcessor:
     specific set of token ids."""
 
     def __init__(self, allowed_ids: Iterable[int]):
-        self.allowed_ids: Optional[list[int]] = list(allowed_ids)
-        self.mask: Optional[torch.Tensor] = None
+        self.allowed_ids: list[int] | None = list(allowed_ids)
+        self.mask: torch.Tensor | None = None
 
     def __call__(self, token_ids: list[int], logits: torch.Tensor) -> torch.Tensor:
         if self.mask is None:
@@ -53,8 +52,8 @@ def logit_bias_logits_processor(
 
 
 def get_logits_processors(
-    logit_bias: Optional[Union[dict[int, float], dict[str, float]]],
-    allowed_token_ids: Optional[list[int]],
+    logit_bias: dict[int, float] | dict[str, float] | None,
+    allowed_token_ids: list[int] | None,
     tokenizer: AnyTokenizer,
 ) -> list[LogitsProcessor]:
     logits_processors: list[LogitsProcessor] = []
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 6ff7ceef48055..8c4c31bda18ca 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -6,7 +6,7 @@
 import json
 import time
 from http import HTTPStatus
-from typing import Annotated, Any, ClassVar, Generic, Literal, Optional, TypeVar, Union
+from typing import Annotated, Any, ClassVar, Generic, Literal, TypeAlias, TypeVar
 
 import regex as re
 import torch
@@ -54,6 +54,7 @@ try:  # For older openai versions (< 1.100.0)
 except ImportError:  # For newer openai versions (>= 1.100.0)
     from openai.types.responses import ResponseFormatTextConfig as ResponseTextConfig
 
+
 from openai.types.responses.response import IncompleteDetails, ToolChoice
 from openai.types.responses.tool import Tool
 from openai.types.shared import Metadata, Reasoning
@@ -67,7 +68,6 @@ from pydantic import (
     field_validator,
     model_validator,
 )
-from typing_extensions import TypeAlias
 
 from vllm import envs
 from vllm.entrypoints.chat_utils import ChatCompletionMessageParam, make_tool_call_id
@@ -93,7 +93,7 @@ class OpenAIBaseModel(BaseModel):
     model_config = ConfigDict(extra="allow")
 
     # Cache class field names
-    field_names: ClassVar[Optional[set[str]]] = None
+    field_names: ClassVar[set[str] | None] = None
 
     @model_validator(mode="wrap")
     @classmethod
@@ -123,7 +123,7 @@ class OpenAIBaseModel(BaseModel):
 class ErrorInfo(OpenAIBaseModel):
     message: str
     type: str
-    param: Optional[str] = None
+    param: str | None = None
     code: int
 
 
@@ -142,7 +142,7 @@ class ModelPermission(OpenAIBaseModel):
     allow_view: bool = True
     allow_fine_tuning: bool = False
     organization: str = "*"
-    group: Optional[str] = None
+    group: str | None = None
     is_blocking: bool = False
 
 
@@ -151,9 +151,9 @@ class ModelCard(OpenAIBaseModel):
     object: str = "model"
     created: int = Field(default_factory=lambda: int(time.time()))
     owned_by: str = "vllm"
-    root: Optional[str] = None
-    parent: Optional[str] = None
-    max_model_len: Optional[int] = None
+    root: str | None = None
+    parent: str | None = None
+    max_model_len: int | None = None
     permission: list[ModelPermission] = Field(default_factory=list)
 
 
@@ -163,37 +163,35 @@ class ModelList(OpenAIBaseModel):
 
 
 class PromptTokenUsageInfo(OpenAIBaseModel):
-    cached_tokens: Optional[int] = None
+    cached_tokens: int | None = None
 
 
 class UsageInfo(OpenAIBaseModel):
     prompt_tokens: int = 0
     total_tokens: int = 0
-    completion_tokens: Optional[int] = 0
-    prompt_tokens_details: Optional[PromptTokenUsageInfo] = None
+    completion_tokens: int | None = 0
+    prompt_tokens_details: PromptTokenUsageInfo | None = None
 
 
 class RequestResponseMetadata(BaseModel):
     request_id: str
-    final_usage_info: Optional[UsageInfo] = None
+    final_usage_info: UsageInfo | None = None
 
 
 class JsonSchemaResponseFormat(OpenAIBaseModel):
     name: str
-    description: Optional[str] = None
+    description: str | None = None
     # schema is the field in openai but that causes conflicts with pydantic so
     # instead use json_schema with an alias
-    json_schema: Optional[dict[str, Any]] = Field(default=None, alias="schema")
-    strict: Optional[bool] = None
+    json_schema: dict[str, Any] | None = Field(default=None, alias="schema")
+    strict: bool | None = None
 
 
 class StructuralTag(OpenAIBaseModel):
     begin: str
     # schema is the field, but that causes conflicts with pydantic so
     # instead use structural_tag_schema with an alias
-    structural_tag_schema: Optional[dict[str, Any]] = Field(
-        default=None, alias="schema"
-    )
+    structural_tag_schema: dict[str, Any] | None = Field(default=None, alias="schema")
     end: str
 
 
@@ -206,21 +204,21 @@ class StructuralTagResponseFormat(OpenAIBaseModel):
 class ResponseFormat(OpenAIBaseModel):
     # type must be "json_schema", "json_object", or "text"
     type: Literal["text", "json_object", "json_schema"]
-    json_schema: Optional[JsonSchemaResponseFormat] = None
+    json_schema: JsonSchemaResponseFormat | None = None
 
 
-AnyResponseFormat = Union[ResponseFormat, StructuralTagResponseFormat]
+AnyResponseFormat: TypeAlias = ResponseFormat | StructuralTagResponseFormat
 
 
 class StreamOptions(OpenAIBaseModel):
-    include_usage: Optional[bool] = True
-    continuous_usage_stats: Optional[bool] = False
+    include_usage: bool | None = True
+    continuous_usage_stats: bool | None = False
 
 
 class FunctionDefinition(OpenAIBaseModel):
     name: str
-    description: Optional[str] = None
-    parameters: Optional[dict[str, Any]] = None
+    description: str | None = None
+    parameters: dict[str, Any] | None = None
 
 
 class ChatCompletionToolsParam(OpenAIBaseModel):
@@ -241,18 +239,18 @@ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
 # see https://github.com/pydantic/pydantic/issues/3125
 class LogitsProcessorConstructor(BaseModel):
     qualname: str
-    args: Optional[list[Any]] = None
-    kwargs: Optional[dict[str, Any]] = None
+    args: list[Any] | None = None
+    kwargs: dict[str, Any] | None = None
 
     model_config = ConfigDict(extra="forbid")
 
 
-LogitsProcessors = list[Union[str, LogitsProcessorConstructor]]
+LogitsProcessors = list[str | LogitsProcessorConstructor]
 
 
 def get_logits_processors(
-    processors: Optional[LogitsProcessors], pattern: Optional[str]
-) -> Optional[list[Any]]:
+    processors: LogitsProcessors | None, pattern: str | None
+) -> list[Any] | None:
     if processors and pattern:
         logits_processors = []
         for processor in processors:
@@ -284,16 +282,16 @@ def get_logits_processors(
     return None
 
 
-ResponseInputOutputItem: TypeAlias = Union[
-    ResponseInputItemParam, ResponseReasoningItem, ResponseFunctionToolCall
-]
+ResponseInputOutputItem: TypeAlias = (
+    ResponseInputItemParam | ResponseReasoningItem | ResponseFunctionToolCall
+)
 
 
 class ResponsesRequest(OpenAIBaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/responses/create
-    background: Optional[bool] = False
-    include: Optional[
+    background: bool | None = False
+    include: (
         list[
             Literal[
                 "code_interpreter_call.outputs",
@@ -304,28 +302,29 @@ class ResponsesRequest(OpenAIBaseModel):
                 "reasoning.encrypted_content",
             ],
         ]
-    ] = None
-    input: Union[str, list[ResponseInputOutputItem]]
-    instructions: Optional[str] = None
-    max_output_tokens: Optional[int] = None
-    max_tool_calls: Optional[int] = None
-    metadata: Optional[Metadata] = None
-    model: Optional[str] = None
-    parallel_tool_calls: Optional[bool] = True
-    previous_response_id: Optional[str] = None
-    prompt: Optional[ResponsePrompt] = None
-    reasoning: Optional[Reasoning] = None
+        | None
+    ) = None
+    input: str | list[ResponseInputOutputItem]
+    instructions: str | None = None
+    max_output_tokens: int | None = None
+    max_tool_calls: int | None = None
+    metadata: Metadata | None = None
+    model: str | None = None
+    parallel_tool_calls: bool | None = True
+    previous_response_id: str | None = None
+    prompt: ResponsePrompt | None = None
+    reasoning: Reasoning | None = None
     service_tier: Literal["auto", "default", "flex", "scale", "priority"] = "auto"
-    store: Optional[bool] = True
-    stream: Optional[bool] = False
-    temperature: Optional[float] = None
-    text: Optional[ResponseTextConfig] = None
+    store: bool | None = True
+    stream: bool | None = False
+    temperature: float | None = None
+    text: ResponseTextConfig | None = None
     tool_choice: ToolChoice = "auto"
     tools: list[Tool] = Field(default_factory=list)
-    top_logprobs: Optional[int] = 0
-    top_p: Optional[float] = None
-    truncation: Optional[Literal["auto", "disabled"]] = "disabled"
-    user: Optional[str] = None
+    top_logprobs: int | None = 0
+    top_p: float | None = None
+    truncation: Literal["auto", "disabled"] | None = "disabled"
+    user: str | None = None
 
     # --8<-- [start:responses-extra-params]
     request_id: str = Field(
@@ -336,7 +335,7 @@ class ResponsesRequest(OpenAIBaseModel):
             "through out the inference process and return in response."
         ),
     )
-    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+    mm_processor_kwargs: dict[str, Any] | None = Field(
         default=None,
         description=("Additional kwargs to pass to the HF processor."),
     )
@@ -348,7 +347,7 @@ class ResponsesRequest(OpenAIBaseModel):
             "if the served model does not use priority scheduling."
         ),
     )
-    cache_salt: Optional[str] = Field(
+    cache_salt: str | None = Field(
         default=None,
         description=(
             "If specified, the prefix cache will be salted with the provided "
@@ -378,7 +377,7 @@ class ResponsesRequest(OpenAIBaseModel):
     def to_sampling_params(
         self,
         default_max_tokens: int,
-        default_sampling_params: Optional[dict] = None,
+        default_sampling_params: dict | None = None,
     ) -> SamplingParams:
         if self.max_output_tokens is None:
             max_tokens = default_max_tokens
@@ -465,58 +464,57 @@ class ChatCompletionRequest(OpenAIBaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/chat/create
     messages: list[ChatCompletionMessageParam]
-    model: Optional[str] = None
-    frequency_penalty: Optional[float] = 0.0
-    logit_bias: Optional[dict[str, float]] = None
-    logprobs: Optional[bool] = False
-    top_logprobs: Optional[int] = 0
-    max_tokens: Optional[int] = Field(
+    model: str | None = None
+    frequency_penalty: float | None = 0.0
+    logit_bias: dict[str, float] | None = None
+    logprobs: bool | None = False
+    top_logprobs: int | None = 0
+    max_tokens: int | None = Field(
         default=None,
         deprecated="max_tokens is deprecated in favor of "
         "the max_completion_tokens field",
     )
-    max_completion_tokens: Optional[int] = None
-    n: Optional[int] = 1
-    presence_penalty: Optional[float] = 0.0
-    response_format: Optional[AnyResponseFormat] = None
-    seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
-    stop: Optional[Union[str, list[str]]] = []
-    stream: Optional[bool] = False
-    stream_options: Optional[StreamOptions] = None
-    temperature: Optional[float] = None
-    top_p: Optional[float] = None
-    tools: Optional[list[ChatCompletionToolsParam]] = None
-    tool_choice: Optional[
-        Union[
-            Literal["none"],
-            Literal["auto"],
-            Literal["required"],
-            ChatCompletionNamedToolChoiceParam,
-        ]
-    ] = "none"
-    reasoning_effort: Optional[Literal["low", "medium", "high"]] = None
+    max_completion_tokens: int | None = None
+    n: int | None = 1
+    presence_penalty: float | None = 0.0
+    response_format: AnyResponseFormat | None = None
+    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    stop: str | list[str] | None = []
+    stream: bool | None = False
+    stream_options: StreamOptions | None = None
+    temperature: float | None = None
+    top_p: float | None = None
+    tools: list[ChatCompletionToolsParam] | None = None
+    tool_choice: (
+        Literal["none"]
+        | Literal["auto"]
+        | Literal["required"]
+        | ChatCompletionNamedToolChoiceParam
+        | None
+    ) = "none"
+    reasoning_effort: Literal["low", "medium", "high"] | None = None
     include_reasoning: bool = True
 
     # NOTE this will be ignored by vLLM -- the model determines the behavior
-    parallel_tool_calls: Optional[bool] = False
-    user: Optional[str] = None
+    parallel_tool_calls: bool | None = False
+    user: str | None = None
 
     # --8<-- [start:chat-completion-sampling-params]
-    best_of: Optional[int] = None
+    best_of: int | None = None
     use_beam_search: bool = False
-    top_k: Optional[int] = None
-    min_p: Optional[float] = None
-    repetition_penalty: Optional[float] = None
+    top_k: int | None = None
+    min_p: float | None = None
+    repetition_penalty: float | None = None
     length_penalty: float = 1.0
-    stop_token_ids: Optional[list[int]] = []
+    stop_token_ids: list[int] | None = []
     include_stop_str_in_output: bool = False
     ignore_eos: bool = False
     min_tokens: int = 0
     skip_special_tokens: bool = True
     spaces_between_special_tokens: bool = True
-    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
-    prompt_logprobs: Optional[int] = None
-    allowed_token_ids: Optional[list[int]] = None
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
+    prompt_logprobs: int | None = None
+    allowed_token_ids: list[int] | None = None
     bad_words: list[str] = Field(default_factory=list)
     # --8<-- [end:chat-completion-sampling-params]
 
@@ -556,7 +554,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "default)."
         ),
     )
-    documents: Optional[list[dict[str, str]]] = Field(
+    documents: list[dict[str, str]] | None = Field(
         default=None,
         description=(
             "A list of dicts representing documents that will be accessible to "
@@ -566,7 +564,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
             '"title" and "text" keys.'
         ),
     )
-    chat_template: Optional[str] = Field(
+    chat_template: str | None = Field(
         default=None,
         description=(
             "A Jinja template to use for this conversion. "
@@ -575,22 +573,22 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "does not define one."
         ),
     )
-    chat_template_kwargs: Optional[dict[str, Any]] = Field(
+    chat_template_kwargs: dict[str, Any] | None = Field(
         default=None,
         description=(
             "Additional keyword args to pass to the template renderer. "
             "Will be accessible by the chat template."
         ),
     )
-    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+    mm_processor_kwargs: dict[str, Any] | None = Field(
         default=None,
         description=("Additional kwargs to pass to the HF processor."),
     )
-    structured_outputs: Optional[StructuredOutputsParams] = Field(
+    structured_outputs: StructuredOutputsParams | None = Field(
         default=None,
         description="Additional kwargs for structured outputs",
     )
-    guided_json: Optional[Union[str, dict, BaseModel]] = Field(
+    guided_json: str | dict | BaseModel | None = Field(
         default=None,
         description=(
             "`guided_json` is deprecated. "
@@ -598,7 +596,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "Please pass `json` to `structured_outputs` instead."
         ),
     )
-    guided_regex: Optional[str] = Field(
+    guided_regex: str | None = Field(
         default=None,
         description=(
             "`guided_regex` is deprecated. "
@@ -606,7 +604,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "Please pass `regex` to `structured_outputs` instead."
         ),
     )
-    guided_choice: Optional[list[str]] = Field(
+    guided_choice: list[str] | None = Field(
         default=None,
         description=(
             "`guided_choice` is deprecated. "
@@ -614,7 +612,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "Please pass `choice` to `structured_outputs` instead."
         ),
     )
-    guided_grammar: Optional[str] = Field(
+    guided_grammar: str | None = Field(
         default=None,
         description=(
             "`guided_grammar` is deprecated. "
@@ -622,7 +620,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "Please pass `grammar` to `structured_outputs` instead."
         ),
     )
-    structural_tag: Optional[str] = Field(
+    structural_tag: str | None = Field(
         default=None,
         description=(
             "`structural_tag` is deprecated. "
@@ -630,7 +628,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "Please pass `structural_tag` to `structured_outputs` instead."
         ),
     )
-    guided_decoding_backend: Optional[str] = Field(
+    guided_decoding_backend: str | None = Field(
         default=None,
         description=(
             "`guided_decoding_backend` is deprecated. "
@@ -638,7 +636,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "Please remove it from your request."
         ),
     )
-    guided_whitespace_pattern: Optional[str] = Field(
+    guided_whitespace_pattern: str | None = Field(
         default=None,
         description=(
             "`guided_whitespace_pattern` is deprecated. "
@@ -662,7 +660,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "through out the inference process and return in response."
         ),
     )
-    logits_processors: Optional[LogitsProcessors] = Field(
+    logits_processors: LogitsProcessors | None = Field(
         default=None,
         description=(
             "A list of either qualified names of logits processors, or "
@@ -675,7 +673,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "{'param': 'value'}}."
         ),
     )
-    return_tokens_as_token_ids: Optional[bool] = Field(
+    return_tokens_as_token_ids: bool | None = Field(
         default=None,
         description=(
             "If specified with 'logprobs', tokens are represented "
@@ -683,7 +681,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "that are not JSON-encodable can be identified."
         ),
     )
-    return_token_ids: Optional[bool] = Field(
+    return_token_ids: bool | None = Field(
         default=None,
         description=(
             "If specified, the result will include token IDs alongside the "
@@ -693,7 +691,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "need to map generated text back to input tokens."
         ),
     )
-    cache_salt: Optional[str] = Field(
+    cache_salt: str | None = Field(
         default=None,
         description=(
             "If specified, the prefix cache will be salted with the provided "
@@ -704,12 +702,12 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "to 256 bit). Not supported by vLLM engine V0."
         ),
     )
-    kv_transfer_params: Optional[dict[str, Any]] = Field(
+    kv_transfer_params: dict[str, Any] | None = Field(
         default=None,
         description="KVTransfer parameters used for disaggregated serving.",
     )
 
-    vllm_xargs: Optional[dict[str, Union[str, int, float]]] = Field(
+    vllm_xargs: dict[str, str | int | float] | None = Field(
         default=None,
         description=(
             "Additional request parameters with string or "
@@ -749,7 +747,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
     def to_sampling_params(
         self,
         max_tokens: int,
-        logits_processor_pattern: Optional[str],
+        logits_processor_pattern: str | None,
         default_sampling_params: dict,
     ) -> SamplingParams:
         # Default parameters
@@ -860,7 +858,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
             extra_args=extra_args or None,
         )
 
-    def _get_json_schema_from_tool(self) -> Optional[Union[str, dict]]:
+    def _get_json_schema_from_tool(self) -> str | dict | None:
         # user has chosen to not use any tool
         if self.tool_choice == "none" or self.tools is None:
             return None
@@ -1098,44 +1096,44 @@ class ChatCompletionRequest(OpenAIBaseModel):
 class CompletionRequest(OpenAIBaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/completions/create
-    model: Optional[str] = None
-    prompt: Optional[Union[list[int], list[list[int]], str, list[str]]] = None
-    best_of: Optional[int] = None
-    echo: Optional[bool] = False
-    frequency_penalty: Optional[float] = 0.0
-    logit_bias: Optional[dict[str, float]] = None
-    logprobs: Optional[int] = None
-    max_tokens: Optional[int] = 16
+    model: str | None = None
+    prompt: list[int] | list[list[int]] | str | list[str] | None = None
+    best_of: int | None = None
+    echo: bool | None = False
+    frequency_penalty: float | None = 0.0
+    logit_bias: dict[str, float] | None = None
+    logprobs: int | None = None
+    max_tokens: int | None = 16
     n: int = 1
-    presence_penalty: Optional[float] = 0.0
-    seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
-    stop: Optional[Union[str, list[str]]] = []
-    stream: Optional[bool] = False
-    stream_options: Optional[StreamOptions] = None
-    suffix: Optional[str] = None
-    temperature: Optional[float] = None
-    top_p: Optional[float] = None
-    user: Optional[str] = None
+    presence_penalty: float | None = 0.0
+    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    stop: str | list[str] | None = []
+    stream: bool | None = False
+    stream_options: StreamOptions | None = None
+    suffix: str | None = None
+    temperature: float | None = None
+    top_p: float | None = None
+    user: str | None = None
 
     # --8<-- [start:completion-sampling-params]
     use_beam_search: bool = False
-    top_k: Optional[int] = None
-    min_p: Optional[float] = None
-    repetition_penalty: Optional[float] = None
+    top_k: int | None = None
+    min_p: float | None = None
+    repetition_penalty: float | None = None
     length_penalty: float = 1.0
-    stop_token_ids: Optional[list[int]] = []
+    stop_token_ids: list[int] | None = []
     include_stop_str_in_output: bool = False
     ignore_eos: bool = False
     min_tokens: int = 0
     skip_special_tokens: bool = True
     spaces_between_special_tokens: bool = True
-    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
-    allowed_token_ids: Optional[list[int]] = None
-    prompt_logprobs: Optional[int] = None
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
+    allowed_token_ids: list[int] | None = None
+    prompt_logprobs: int | None = None
     # --8<-- [end:completion-sampling-params]
 
     # --8<-- [start:completion-extra-params]
-    prompt_embeds: Optional[Union[bytes, list[bytes]]] = None
+    prompt_embeds: bytes | list[bytes] | None = None
     add_special_tokens: bool = Field(
         default=True,
         description=(
@@ -1143,7 +1141,7 @@ class CompletionRequest(OpenAIBaseModel):
             "the prompt."
         ),
     )
-    response_format: Optional[AnyResponseFormat] = Field(
+    response_format: AnyResponseFormat | None = Field(
         default=None,
         description=(
             "Similar to chat completion, this parameter specifies the format "
@@ -1151,11 +1149,11 @@ class CompletionRequest(OpenAIBaseModel):
             ", {'type': 'structural_tag'}, or {'type': 'text' } is supported."
         ),
     )
-    structured_outputs: Optional[StructuredOutputsParams] = Field(
+    structured_outputs: StructuredOutputsParams | None = Field(
         default=None,
         description="Additional kwargs for structured outputs",
     )
-    guided_json: Optional[Union[str, dict, BaseModel]] = Field(
+    guided_json: str | dict | BaseModel | None = Field(
         default=None,
         description=(
             "`guided_json` is deprecated. "
@@ -1163,7 +1161,7 @@ class CompletionRequest(OpenAIBaseModel):
             "Please pass `json` to `structured_outputs` instead."
         ),
     )
-    guided_regex: Optional[str] = Field(
+    guided_regex: str | None = Field(
         default=None,
         description=(
             "`guided_regex` is deprecated. "
@@ -1171,7 +1169,7 @@ class CompletionRequest(OpenAIBaseModel):
             "Please pass `regex` to `structured_outputs` instead."
         ),
     )
-    guided_choice: Optional[list[str]] = Field(
+    guided_choice: list[str] | None = Field(
         default=None,
         description=(
             "`guided_choice` is deprecated. "
@@ -1179,7 +1177,7 @@ class CompletionRequest(OpenAIBaseModel):
             "Please pass `choice` to `structured_outputs` instead."
         ),
     )
-    guided_grammar: Optional[str] = Field(
+    guided_grammar: str | None = Field(
         default=None,
         description=(
             "`guided_grammar` is deprecated. "
@@ -1187,7 +1185,7 @@ class CompletionRequest(OpenAIBaseModel):
             "Please pass `grammar` to `structured_outputs` instead."
         ),
     )
-    guided_decoding_backend: Optional[str] = Field(
+    guided_decoding_backend: str | None = Field(
         default=None,
         description=(
             "`guided_decoding_backend` is deprecated. "
@@ -1195,7 +1193,7 @@ class CompletionRequest(OpenAIBaseModel):
             "Please remove it from your request."
         ),
     )
-    guided_whitespace_pattern: Optional[str] = Field(
+    guided_whitespace_pattern: str | None = Field(
         default=None,
         description=(
             "`guided_whitespace_pattern` is deprecated. "
@@ -1219,7 +1217,7 @@ class CompletionRequest(OpenAIBaseModel):
             "through out the inference process and return in response."
         ),
     )
-    logits_processors: Optional[LogitsProcessors] = Field(
+    logits_processors: LogitsProcessors | None = Field(
         default=None,
         description=(
             "A list of either qualified names of logits processors, or "
@@ -1233,7 +1231,7 @@ class CompletionRequest(OpenAIBaseModel):
         ),
     )
 
-    return_tokens_as_token_ids: Optional[bool] = Field(
+    return_tokens_as_token_ids: bool | None = Field(
         default=None,
         description=(
             "If specified with 'logprobs', tokens are represented "
@@ -1241,7 +1239,7 @@ class CompletionRequest(OpenAIBaseModel):
             "that are not JSON-encodable can be identified."
         ),
     )
-    return_token_ids: Optional[bool] = Field(
+    return_token_ids: bool | None = Field(
         default=None,
         description=(
             "If specified, the result will include token IDs alongside the "
@@ -1252,7 +1250,7 @@ class CompletionRequest(OpenAIBaseModel):
         ),
     )
 
-    cache_salt: Optional[str] = Field(
+    cache_salt: str | None = Field(
         default=None,
         description=(
             "If specified, the prefix cache will be salted with the provided "
@@ -1264,12 +1262,12 @@ class CompletionRequest(OpenAIBaseModel):
         ),
     )
 
-    kv_transfer_params: Optional[dict[str, Any]] = Field(
+    kv_transfer_params: dict[str, Any] | None = Field(
         default=None,
         description="KVTransfer parameters used for disaggregated serving.",
     )
 
-    vllm_xargs: Optional[dict[str, Union[str, int, float]]] = Field(
+    vllm_xargs: dict[str, str | int | float] | None = Field(
         default=None,
         description=(
             "Additional request parameters with string or "
@@ -1291,7 +1289,7 @@ class CompletionRequest(OpenAIBaseModel):
     def to_beam_search_params(
         self,
         max_tokens: int,
-        default_sampling_params: Optional[dict] = None,
+        default_sampling_params: dict | None = None,
     ) -> BeamSearchParams:
         if default_sampling_params is None:
             default_sampling_params = {}
@@ -1312,8 +1310,8 @@ class CompletionRequest(OpenAIBaseModel):
     def to_sampling_params(
         self,
         max_tokens: int,
-        logits_processor_pattern: Optional[str],
-        default_sampling_params: Optional[dict] = None,
+        logits_processor_pattern: str | None,
+        default_sampling_params: dict | None = None,
     ) -> SamplingParams:
         if default_sampling_params is None:
             default_sampling_params = {}
@@ -1488,12 +1486,12 @@ class CompletionRequest(OpenAIBaseModel):
 class EmbeddingCompletionRequest(OpenAIBaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/embeddings
-    model: Optional[str] = None
-    input: Union[list[int], list[list[int]], str, list[str]]
+    model: str | None = None
+    input: list[int] | list[list[int]] | str | list[str]
     encoding_format: Literal["float", "base64"] = "float"
-    dimensions: Optional[int] = None
-    user: Optional[str] = None
-    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+    dimensions: int | None = None
+    user: str | None = None
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
 
     # --8<-- [start:embedding-extra-params]
     add_special_tokens: bool = Field(
@@ -1519,7 +1517,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
             "through out the inference process and return in response."
         ),
     )
-    normalize: Optional[bool] = None
+    normalize: bool | None = None
 
     # --8<-- [end:embedding-extra-params]
 
@@ -1532,13 +1530,13 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
 
 
 class EmbeddingChatRequest(OpenAIBaseModel):
-    model: Optional[str] = None
+    model: str | None = None
     messages: list[ChatCompletionMessageParam]
 
     encoding_format: Literal["float", "base64"] = "float"
-    dimensions: Optional[int] = None
-    user: Optional[str] = None
-    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+    dimensions: int | None = None
+    user: str | None = None
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
 
     # --8<-- [start:chat-embedding-extra-params]
     add_generation_prompt: bool = Field(
@@ -1560,7 +1558,7 @@ class EmbeddingChatRequest(OpenAIBaseModel):
             "default)."
         ),
     )
-    chat_template: Optional[str] = Field(
+    chat_template: str | None = Field(
         default=None,
         description=(
             "A Jinja template to use for this conversion. "
@@ -1569,14 +1567,14 @@ class EmbeddingChatRequest(OpenAIBaseModel):
             "does not define one."
         ),
     )
-    chat_template_kwargs: Optional[dict[str, Any]] = Field(
+    chat_template_kwargs: dict[str, Any] | None = Field(
         default=None,
         description=(
             "Additional keyword args to pass to the template renderer. "
             "Will be accessible by the chat template."
         ),
     )
-    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+    mm_processor_kwargs: dict[str, Any] | None = Field(
         default=None,
         description=("Additional kwargs to pass to the HF processor."),
     )
@@ -1596,7 +1594,7 @@ class EmbeddingChatRequest(OpenAIBaseModel):
             "through out the inference process and return in response."
         ),
     )
-    normalize: Optional[bool] = None
+    normalize: bool | None = None
     # --8<-- [end:chat-embedding-extra-params]
 
     @model_validator(mode="before")
@@ -1617,7 +1615,7 @@ class EmbeddingChatRequest(OpenAIBaseModel):
         )
 
 
-EmbeddingRequest = Union[EmbeddingCompletionRequest, EmbeddingChatRequest]
+EmbeddingRequest: TypeAlias = EmbeddingCompletionRequest | EmbeddingChatRequest
 
 PoolingCompletionRequest = EmbeddingCompletionRequest
 PoolingChatRequest = EmbeddingChatRequest
@@ -1626,7 +1624,7 @@ T = TypeVar("T")
 
 
 class IOProcessorRequest(OpenAIBaseModel, Generic[T]):
-    model: Optional[str] = None
+    model: str | None = None
 
     priority: int = Field(default=0)
     """
@@ -1646,7 +1644,7 @@ class IOProcessorRequest(OpenAIBaseModel, Generic[T]):
 
 
 class IOProcessorResponse(OpenAIBaseModel, Generic[T]):
-    request_id: Optional[str] = None
+    request_id: str | None = None
     """
     The request_id associated with this response
     """
@@ -1659,18 +1657,20 @@ class IOProcessorResponse(OpenAIBaseModel, Generic[T]):
     """
 
 
-PoolingRequest = Union[PoolingCompletionRequest, PoolingChatRequest, IOProcessorRequest]
+PoolingRequest: TypeAlias = (
+    PoolingCompletionRequest | PoolingChatRequest | IOProcessorRequest
+)
 
 
 class ScoreRequest(OpenAIBaseModel):
-    model: Optional[str] = None
-    text_1: Union[list[str], str, ScoreMultiModalParam]
-    text_2: Union[list[str], str, ScoreMultiModalParam]
-    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+    model: str | None = None
+    text_1: list[str] | str | ScoreMultiModalParam
+    text_2: list[str] | str | ScoreMultiModalParam
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
 
     # --8<-- [start:score-extra-params]
 
-    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+    mm_processor_kwargs: dict[str, Any] | None = Field(
         default=None,
         description=("Additional kwargs to pass to the HF processor."),
     )
@@ -1684,7 +1684,7 @@ class ScoreRequest(OpenAIBaseModel):
         ),
     )
 
-    activation: Optional[bool] = None
+    activation: bool | None = None
 
     # --8<-- [end:score-extra-params]
 
@@ -1696,15 +1696,15 @@ class ScoreRequest(OpenAIBaseModel):
 
 
 class RerankRequest(OpenAIBaseModel):
-    model: Optional[str] = None
-    query: Union[str, ScoreMultiModalParam]
-    documents: Union[list[str], ScoreMultiModalParam]
+    model: str | None = None
+    query: str | ScoreMultiModalParam
+    documents: list[str] | ScoreMultiModalParam
     top_n: int = Field(default_factory=lambda: 0)
-    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None
 
     # --8<-- [start:rerank-extra-params]
 
-    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+    mm_processor_kwargs: dict[str, Any] | None = Field(
         default=None,
         description=("Additional kwargs to pass to the HF processor."),
     )
@@ -1718,7 +1718,7 @@ class RerankRequest(OpenAIBaseModel):
         ),
     )
 
-    activation: Optional[bool] = None
+    activation: bool | None = None
 
     # --8<-- [end:rerank-extra-params]
 
@@ -1730,8 +1730,8 @@ class RerankRequest(OpenAIBaseModel):
 
 
 class RerankDocument(BaseModel):
-    text: Optional[str] = None
-    multi_modal: Optional[ScoreContentPartParam] = None
+    text: str | None = None
+    multi_modal: ScoreContentPartParam | None = None
 
 
 class RerankResult(BaseModel):
@@ -1753,17 +1753,17 @@ class RerankResponse(OpenAIBaseModel):
 
 class CompletionLogProbs(OpenAIBaseModel):
     text_offset: list[int] = Field(default_factory=list)
-    token_logprobs: list[Optional[float]] = Field(default_factory=list)
+    token_logprobs: list[float | None] = Field(default_factory=list)
     tokens: list[str] = Field(default_factory=list)
-    top_logprobs: list[Optional[dict[str, float]]] = Field(default_factory=list)
+    top_logprobs: list[dict[str, float] | None] = Field(default_factory=list)
 
 
 class CompletionResponseChoice(OpenAIBaseModel):
     index: int
     text: str
-    logprobs: Optional[CompletionLogProbs] = None
-    finish_reason: Optional[str] = None
-    stop_reason: Optional[Union[int, str]] = Field(
+    logprobs: CompletionLogProbs | None = None
+    finish_reason: str | None = None
+    stop_reason: int | str | None = Field(
         default=None,
         description=(
             "The stop string or token id that caused the completion "
@@ -1771,9 +1771,9 @@ class CompletionResponseChoice(OpenAIBaseModel):
             "including encountering the EOS token"
         ),
     )
-    token_ids: Optional[list[int]] = None  # For response
-    prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
-    prompt_token_ids: Optional[list[int]] = None  # For prompt
+    token_ids: list[int] | None = None  # For response
+    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
+    prompt_token_ids: list[int] | None = None  # For prompt
 
 
 class CompletionResponse(OpenAIBaseModel):
@@ -1782,14 +1782,12 @@ class CompletionResponse(OpenAIBaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: list[CompletionResponseChoice]
-    service_tier: Optional[Literal["auto", "default", "flex", "scale", "priority"]] = (
-        None
-    )
-    system_fingerprint: Optional[str] = None
+    service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None
+    system_fingerprint: str | None = None
     usage: UsageInfo
 
     # vLLM-specific fields that are not in OpenAI spec
-    kv_transfer_params: Optional[dict[str, Any]] = Field(
+    kv_transfer_params: dict[str, Any] | None = Field(
         default=None, description="KVTransfer parameters."
     )
 
@@ -1797,9 +1795,9 @@ class CompletionResponse(OpenAIBaseModel):
 class CompletionResponseStreamChoice(OpenAIBaseModel):
     index: int
     text: str
-    logprobs: Optional[CompletionLogProbs] = None
-    finish_reason: Optional[str] = None
-    stop_reason: Optional[Union[int, str]] = Field(
+    logprobs: CompletionLogProbs | None = None
+    finish_reason: str | None = None
+    stop_reason: int | str | None = Field(
         default=None,
         description=(
             "The stop string or token id that caused the completion "
@@ -1809,8 +1807,8 @@ class CompletionResponseStreamChoice(OpenAIBaseModel):
     )
     # not part of the OpenAI spec but for tracing the tokens
     # prompt tokens is put into choice to align with CompletionResponseChoice
-    prompt_token_ids: Optional[list[int]] = None
-    token_ids: Optional[list[int]] = None
+    prompt_token_ids: list[int] | None = None
+    token_ids: list[int] | None = None
 
 
 class CompletionStreamResponse(OpenAIBaseModel):
@@ -1819,13 +1817,13 @@ class CompletionStreamResponse(OpenAIBaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: list[CompletionResponseStreamChoice]
-    usage: Optional[UsageInfo] = Field(default=None)
+    usage: UsageInfo | None = Field(default=None)
 
 
 class EmbeddingResponseData(OpenAIBaseModel):
     index: int
     object: str = "embedding"
-    embedding: Union[list[float], str]
+    embedding: list[float] | str
 
 
 class EmbeddingResponse(OpenAIBaseModel):
@@ -1840,7 +1838,7 @@ class EmbeddingResponse(OpenAIBaseModel):
 class PoolingResponseData(OpenAIBaseModel):
     index: int
     object: str = "pooling"
-    data: Union[list[list[float]], list[float], str]
+    data: list[list[float]] | list[float] | str
 
 
 class PoolingResponse(OpenAIBaseModel):
@@ -1868,10 +1866,10 @@ class ScoreResponse(OpenAIBaseModel):
 
 
 class ClassificationRequest(OpenAIBaseModel):
-    model: Optional[str] = None
-    input: Union[list[str], str]
-    truncate_prompt_tokens: Optional[int] = None
-    user: Optional[str] = None
+    model: str | None = None
+    input: list[str] | str
+    truncate_prompt_tokens: int | None = None
+    user: str | None = None
 
     # --8<-- [start:classification-extra-params]
     priority: int = Field(
@@ -1883,7 +1881,7 @@ class ClassificationRequest(OpenAIBaseModel):
         ),
     )
 
-    activation: Optional[bool] = None
+    activation: bool | None = None
 
     # --8<-- [end:classification-extra-params]
 
@@ -1896,7 +1894,7 @@ class ClassificationRequest(OpenAIBaseModel):
 
 class ClassificationData(OpenAIBaseModel):
     index: int
-    label: Optional[str]
+    label: str | None
     probs: list[float]
     num_classes: int
 
@@ -1922,16 +1920,16 @@ class ToolCall(OpenAIBaseModel):
 
 
 class DeltaFunctionCall(BaseModel):
-    name: Optional[str] = None
-    arguments: Optional[str] = None
+    name: str | None = None
+    arguments: str | None = None
 
 
 # a tool call delta where everything is optional
 class DeltaToolCall(OpenAIBaseModel):
-    id: Optional[str] = None
-    type: Optional[Literal["function"]] = None
+    id: str | None = None
+    type: Literal["function"] | None = None
     index: int
-    function: Optional[DeltaFunctionCall] = None
+    function: DeltaFunctionCall | None = None
 
 
 class ExtractedToolCallInformation(BaseModel):
@@ -1943,50 +1941,50 @@ class ExtractedToolCallInformation(BaseModel):
 
     # content - per OpenAI spec, content AND tool calls can be returned rarely
     # But some models will do this intentionally
-    content: Optional[str] = None
+    content: str | None = None
 
 
 class ChatMessage(OpenAIBaseModel):
     role: str
-    content: Optional[str] = None
-    refusal: Optional[str] = None
-    annotations: Optional[OpenAIAnnotation] = None
-    audio: Optional[OpenAIChatCompletionAudio] = None
-    function_call: Optional[FunctionCall] = None
+    content: str | None = None
+    refusal: str | None = None
+    annotations: OpenAIAnnotation | None = None
+    audio: OpenAIChatCompletionAudio | None = None
+    function_call: FunctionCall | None = None
     tool_calls: list[ToolCall] = Field(default_factory=list)
 
     # vLLM-specific fields that are not in OpenAI spec
-    reasoning_content: Optional[str] = None
+    reasoning_content: str | None = None
 
 
 class ChatCompletionLogProb(OpenAIBaseModel):
     token: str
     logprob: float = -9999.0
-    bytes: Optional[list[int]] = None
+    bytes: list[int] | None = None
 
 
 class ChatCompletionLogProbsContent(ChatCompletionLogProb):
     # Workaround: redefine fields name cache so that it's not
     # shared with the super class.
-    field_names: ClassVar[Optional[set[str]]] = None
+    field_names: ClassVar[set[str] | None] = None
     top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list)
 
 
 class ChatCompletionLogProbs(OpenAIBaseModel):
-    content: Optional[list[ChatCompletionLogProbsContent]] = None
+    content: list[ChatCompletionLogProbsContent] | None = None
 
 
 class ChatCompletionResponseChoice(OpenAIBaseModel):
     index: int
     message: ChatMessage
-    logprobs: Optional[ChatCompletionLogProbs] = None
+    logprobs: ChatCompletionLogProbs | None = None
     # per OpenAI spec this is the default
-    finish_reason: Optional[str] = "stop"
+    finish_reason: str | None = "stop"
     # not part of the OpenAI spec but included in vLLM for legacy reasons
-    stop_reason: Optional[Union[int, str]] = None
+    stop_reason: int | str | None = None
     # not part of the OpenAI spec but is useful for tracing the tokens
     # in agent scenarios
-    token_ids: Optional[list[int]] = None
+    token_ids: list[int] | None = None
 
 
 class ChatCompletionResponse(OpenAIBaseModel):
@@ -1995,35 +1993,33 @@ class ChatCompletionResponse(OpenAIBaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: list[ChatCompletionResponseChoice]
-    service_tier: Optional[Literal["auto", "default", "flex", "scale", "priority"]] = (
-        None
-    )
-    system_fingerprint: Optional[str] = None
+    service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None
+    system_fingerprint: str | None = None
     usage: UsageInfo
 
     # vLLM-specific fields that are not in OpenAI spec
-    prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
-    prompt_token_ids: Optional[list[int]] = None
-    kv_transfer_params: Optional[dict[str, Any]] = Field(
+    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
+    prompt_token_ids: list[int] | None = None
+    kv_transfer_params: dict[str, Any] | None = Field(
         default=None, description="KVTransfer parameters."
     )
 
 
 class DeltaMessage(OpenAIBaseModel):
-    role: Optional[str] = None
-    content: Optional[str] = None
-    reasoning_content: Optional[str] = None
+    role: str | None = None
+    content: str | None = None
+    reasoning_content: str | None = None
     tool_calls: list[DeltaToolCall] = Field(default_factory=list)
 
 
 class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
     index: int
     delta: DeltaMessage
-    logprobs: Optional[ChatCompletionLogProbs] = None
-    finish_reason: Optional[str] = None
-    stop_reason: Optional[Union[int, str]] = None
+    logprobs: ChatCompletionLogProbs | None = None
+    finish_reason: str | None = None
+    stop_reason: int | str | None = None
     # not part of the OpenAI spec but for tracing the tokens
-    token_ids: Optional[list[int]] = None
+    token_ids: list[int] | None = None
 
 
 class ChatCompletionStreamResponse(OpenAIBaseModel):
@@ -2032,15 +2028,15 @@ class ChatCompletionStreamResponse(OpenAIBaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: list[ChatCompletionResponseStreamChoice]
-    usage: Optional[UsageInfo] = Field(default=None)
+    usage: UsageInfo | None = Field(default=None)
     # not part of the OpenAI spec but for tracing the tokens
-    prompt_token_ids: Optional[list[int]] = None
+    prompt_token_ids: list[int] | None = None
 
 
 class TranscriptionResponseStreamChoice(OpenAIBaseModel):
     delta: DeltaMessage
-    finish_reason: Optional[str] = None
-    stop_reason: Optional[Union[int, str]] = None
+    finish_reason: str | None = None
+    stop_reason: int | str | None = None
 
 
 class TranscriptionStreamResponse(OpenAIBaseModel):
@@ -2049,7 +2045,7 @@ class TranscriptionStreamResponse(OpenAIBaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: list[TranscriptionResponseStreamChoice]
-    usage: Optional[UsageInfo] = Field(default=None)
+    usage: UsageInfo | None = Field(default=None)
 
 
 class InputTokensDetails(OpenAIBaseModel):
@@ -2073,9 +2069,9 @@ class ResponsesResponse(OpenAIBaseModel):
     id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
     created_at: int = Field(default_factory=lambda: int(time.time()))
     # error: Optional[ResponseError] = None
-    incomplete_details: Optional[IncompleteDetails] = None
-    instructions: Optional[str] = None
-    metadata: Optional[Metadata] = None
+    incomplete_details: IncompleteDetails | None = None
+    instructions: str | None = None
+    metadata: Metadata | None = None
     model: str
     object: Literal["response"] = "response"
     output: list[ResponseOutputItem]
@@ -2086,24 +2082,24 @@ class ResponsesResponse(OpenAIBaseModel):
     top_p: float
     background: bool
     max_output_tokens: int
-    max_tool_calls: Optional[int] = None
-    previous_response_id: Optional[str] = None
-    prompt: Optional[ResponsePrompt] = None
-    reasoning: Optional[Reasoning] = None
+    max_tool_calls: int | None = None
+    previous_response_id: str | None = None
+    prompt: ResponsePrompt | None = None
+    reasoning: Reasoning | None = None
     service_tier: Literal["auto", "default", "flex", "scale", "priority"]
     status: ResponseStatus
-    text: Optional[ResponseTextConfig] = None
-    top_logprobs: Optional[int] = None
+    text: ResponseTextConfig | None = None
+    top_logprobs: int | None = None
     truncation: Literal["auto", "disabled"]
-    usage: Optional[ResponseUsage] = None
-    user: Optional[str] = None
+    usage: ResponseUsage | None = None
+    user: str | None = None
 
     # --8<-- [start:responses-extra-params]
     # These are populated when enable_response_messages is set to True
     # NOTE: custom serialization is needed
     # see serialize_input_messages and serialize_output_messages
-    input_messages: Optional[list[ChatCompletionMessageParam]] = None
-    output_messages: Optional[list[ChatCompletionMessageParam]] = None
+    input_messages: list[ChatCompletionMessageParam] | None = None
+    output_messages: list[ChatCompletionMessageParam] | None = None
     # --8<-- [end:responses-extra-params]
 
     # NOTE: openAI harmony doesn't serialize TextContent properly,
@@ -2150,11 +2146,11 @@ class ResponsesResponse(OpenAIBaseModel):
         created_time: int,
         output: list[ResponseOutputItem],
         status: ResponseStatus,
-        usage: Optional[ResponseUsage] = None,
-        input_messages: Optional[list[ChatCompletionMessageParam]] = None,
-        output_messages: Optional[list[ChatCompletionMessageParam]] = None,
+        usage: ResponseUsage | None = None,
+        input_messages: list[ChatCompletionMessageParam] | None = None,
+        output_messages: list[ChatCompletionMessageParam] | None = None,
     ) -> "ResponsesResponse":
-        incomplete_details: Optional[IncompleteDetails] = None
+        incomplete_details: IncompleteDetails | None = None
         if status == "incomplete":
             incomplete_details = IncompleteDetails(reason="max_output_tokens")
         # TODO: implement the other reason for incomplete_details,
@@ -2249,31 +2245,31 @@ class ResponseInProgressEvent(OpenAIResponseInProgressEvent):
     response: ResponsesResponse  # type: ignore[override]
 
 
-StreamingResponsesResponse: TypeAlias = Union[
-    "ResponseCreatedEvent",
-    "ResponseInProgressEvent",
-    "ResponseCompletedEvent",
-    ResponseOutputItemAddedEvent,
-    ResponseOutputItemDoneEvent,
-    ResponseContentPartAddedEvent,
-    ResponseContentPartDoneEvent,
-    ResponseReasoningTextDeltaEvent,
-    ResponseReasoningTextDoneEvent,
-    ResponseReasoningPartAddedEvent,
-    ResponseReasoningPartDoneEvent,
-    ResponseCodeInterpreterCallInProgressEvent,
-    ResponseCodeInterpreterCallCodeDeltaEvent,
-    ResponseWebSearchCallInProgressEvent,
-    ResponseWebSearchCallSearchingEvent,
-    ResponseWebSearchCallCompletedEvent,
-    ResponseCodeInterpreterCallCodeDoneEvent,
-    ResponseCodeInterpreterCallInterpretingEvent,
-    ResponseCodeInterpreterCallCompletedEvent,
-]
+StreamingResponsesResponse: TypeAlias = (
+    ResponseCreatedEvent
+    | ResponseInProgressEvent
+    | ResponseCompletedEvent
+    | ResponseOutputItemAddedEvent
+    | ResponseOutputItemDoneEvent
+    | ResponseContentPartAddedEvent
+    | ResponseContentPartDoneEvent
+    | ResponseReasoningTextDeltaEvent
+    | ResponseReasoningTextDoneEvent
+    | ResponseReasoningPartAddedEvent
+    | ResponseReasoningPartDoneEvent
+    | ResponseCodeInterpreterCallInProgressEvent
+    | ResponseCodeInterpreterCallCodeDeltaEvent
+    | ResponseWebSearchCallInProgressEvent
+    | ResponseWebSearchCallSearchingEvent
+    | ResponseWebSearchCallCompletedEvent
+    | ResponseCodeInterpreterCallCodeDoneEvent
+    | ResponseCodeInterpreterCallInterpretingEvent
+    | ResponseCodeInterpreterCallCompletedEvent
+)
 
-BatchRequestInputBody = Union[
-    ChatCompletionRequest, EmbeddingRequest, ScoreRequest, RerankRequest
-]
+BatchRequestInputBody: TypeAlias = (
+    ChatCompletionRequest | EmbeddingRequest | ScoreRequest | RerankRequest
+)
 
 
 class BatchRequestInput(OpenAIBaseModel):
@@ -2322,9 +2318,13 @@ class BatchResponseData(OpenAIBaseModel):
     request_id: str
 
     # The body of the response.
-    body: Optional[
-        Union[ChatCompletionResponse, EmbeddingResponse, ScoreResponse, RerankResponse]
-    ] = None
+    body: (
+        ChatCompletionResponse
+        | EmbeddingResponse
+        | ScoreResponse
+        | RerankResponse
+        | None
+    ) = None
 
 
 class BatchRequestOutput(OpenAIBaseModel):
@@ -2338,15 +2338,15 @@ class BatchRequestOutput(OpenAIBaseModel):
     # inputs.
     custom_id: str
 
-    response: Optional[BatchResponseData]
+    response: BatchResponseData | None
 
     # For requests that failed with a non-HTTP error, this will contain more
     # information on the cause of the failure.
-    error: Optional[Any]
+    error: Any | None
 
 
 class TokenizeCompletionRequest(OpenAIBaseModel):
-    model: Optional[str] = None
+    model: str | None = None
     prompt: str
 
     add_special_tokens: bool = Field(
@@ -2356,7 +2356,7 @@ class TokenizeCompletionRequest(OpenAIBaseModel):
             "the prompt."
         ),
     )
-    return_token_strs: Optional[bool] = Field(
+    return_token_strs: bool | None = Field(
         default=False,
         description=(
             "If true, also return the token strings corresponding to the token ids."
@@ -2365,7 +2365,7 @@ class TokenizeCompletionRequest(OpenAIBaseModel):
 
 
 class TokenizeChatRequest(OpenAIBaseModel):
-    model: Optional[str] = None
+    model: str | None = None
     messages: list[ChatCompletionMessageParam]
 
     add_generation_prompt: bool = Field(
@@ -2376,7 +2376,7 @@ class TokenizeChatRequest(OpenAIBaseModel):
             "model."
         ),
     )
-    return_token_strs: Optional[bool] = Field(
+    return_token_strs: bool | None = Field(
         default=False,
         description=(
             "If true, also return the token strings corresponding to the token ids."
@@ -2402,7 +2402,7 @@ class TokenizeChatRequest(OpenAIBaseModel):
             "default)."
         ),
     )
-    chat_template: Optional[str] = Field(
+    chat_template: str | None = Field(
         default=None,
         description=(
             "A Jinja template to use for this conversion. "
@@ -2411,18 +2411,18 @@ class TokenizeChatRequest(OpenAIBaseModel):
             "does not define one."
         ),
     )
-    chat_template_kwargs: Optional[dict[str, Any]] = Field(
+    chat_template_kwargs: dict[str, Any] | None = Field(
         default=None,
         description=(
             "Additional keyword args to pass to the template renderer. "
             "Will be accessible by the chat template."
         ),
     )
-    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+    mm_processor_kwargs: dict[str, Any] | None = Field(
         default=None,
         description=("Additional kwargs to pass to the HF processor."),
     )
-    tools: Optional[list[ChatCompletionToolsParam]] = Field(
+    tools: list[ChatCompletionToolsParam] | None = Field(
         default=None,
         description=("A list of tools the model may call."),
     )
@@ -2438,18 +2438,18 @@ class TokenizeChatRequest(OpenAIBaseModel):
         return data
 
 
-TokenizeRequest = Union[TokenizeCompletionRequest, TokenizeChatRequest]
+TokenizeRequest: TypeAlias = TokenizeCompletionRequest | TokenizeChatRequest
 
 
 class TokenizeResponse(OpenAIBaseModel):
     count: int
     max_model_len: int
     tokens: list[int]
-    token_strs: Optional[list[str]] = None
+    token_strs: list[str] | None = None
 
 
 class DetokenizeRequest(OpenAIBaseModel):
-    model: Optional[str] = None
+    model: str | None = None
     tokens: list[int]
 
 
@@ -2474,7 +2474,7 @@ class LoadLoRAAdapterRequest(BaseModel):
 
 class UnloadLoRAAdapterRequest(BaseModel):
     lora_name: str
-    lora_int_id: Optional[int] = Field(default=None)
+    lora_int_id: int | None = Field(default=None)
 
 
 ## Protocols for Audio
@@ -2491,11 +2491,11 @@ class TranscriptionRequest(OpenAIBaseModel):
     formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
     """
 
-    model: Optional[str] = None
+    model: str | None = None
     """ID of the model to use.
     """
 
-    language: Optional[str] = None
+    language: str | None = None
     """The language of the input audio.
 
     Supplying the input language in
@@ -2530,16 +2530,16 @@ class TranscriptionRequest(OpenAIBaseModel):
     timestamps incurs additional latency.
     """
 
-    stream: Optional[bool] = False
+    stream: bool | None = False
     """When set, it will enable output to be streamed in a similar fashion
     as the Chat Completion endpoint.
     """
     # --8<-- [start:transcription-extra-params]
     # Flattened stream option to simplify form data.
-    stream_include_usage: Optional[bool] = False
-    stream_continuous_usage_stats: Optional[bool] = False
+    stream_include_usage: bool | None = False
+    stream_continuous_usage_stats: bool | None = False
 
-    vllm_xargs: Optional[dict[str, Union[str, int, float]]] = Field(
+    vllm_xargs: dict[str, str | int | float] | None = Field(
         default=None,
         description=(
             "Additional request parameters with string or "
@@ -2548,7 +2548,7 @@ class TranscriptionRequest(OpenAIBaseModel):
     )
     # --8<-- [end:transcription-extra-params]
 
-    to_language: Optional[str] = None
+    to_language: str | None = None
     """The language of the output audio we transcribe to.
 
     Please note that this is not currently used by supported models at this
@@ -2565,29 +2565,29 @@ class TranscriptionRequest(OpenAIBaseModel):
     to automatically increase the temperature until certain thresholds are hit.
     """
 
-    top_p: Optional[float] = None
+    top_p: float | None = None
     """Enables nucleus (top-p) sampling, where tokens are selected from the
     smallest possible set whose cumulative probability exceeds `p`.
     """
 
-    top_k: Optional[int] = None
+    top_k: int | None = None
     """Limits sampling to the `k` most probable tokens at each step."""
 
-    min_p: Optional[float] = None
+    min_p: float | None = None
     """Filters out tokens with a probability lower than `min_p`, ensuring a
     minimum likelihood threshold during sampling.
     """
 
-    seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
     """The seed to use for sampling."""
 
-    frequency_penalty: Optional[float] = 0.0
+    frequency_penalty: float | None = 0.0
     """The frequency penalty to use for sampling."""
 
-    repetition_penalty: Optional[float] = None
+    repetition_penalty: float | None = None
     """The repetition penalty to use for sampling."""
 
-    presence_penalty: Optional[float] = 0.0
+    presence_penalty: float | None = 0.0
     """The presence penalty to use for sampling."""
     # --8<-- [end:transcription-sampling-params]
 
@@ -2601,7 +2601,7 @@ class TranscriptionRequest(OpenAIBaseModel):
     }
 
     def to_sampling_params(
-        self, default_max_tokens: int, default_sampling_params: Optional[dict] = None
+        self, default_max_tokens: int, default_sampling_params: dict | None = None
     ) -> SamplingParams:
         max_tokens = default_max_tokens
 
@@ -2740,17 +2740,17 @@ class TranscriptionResponseVerbose(OpenAIBaseModel):
     text: str
     """The transcribed text."""
 
-    segments: Optional[list[TranscriptionSegment]] = None
+    segments: list[TranscriptionSegment] | None = None
     """Segments of the transcribed text and their corresponding details."""
 
-    words: Optional[list[TranscriptionWord]] = None
+    words: list[TranscriptionWord] | None = None
     """Extracted words and their corresponding timestamps."""
 
 
 class TranslationResponseStreamChoice(OpenAIBaseModel):
     delta: DeltaMessage
-    finish_reason: Optional[str] = None
-    stop_reason: Optional[Union[int, str]] = None
+    finish_reason: str | None = None
+    stop_reason: int | str | None = None
 
 
 class TranslationStreamResponse(OpenAIBaseModel):
@@ -2759,7 +2759,7 @@ class TranslationStreamResponse(OpenAIBaseModel):
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: list[TranslationResponseStreamChoice]
-    usage: Optional[UsageInfo] = Field(default=None)
+    usage: UsageInfo | None = Field(default=None)
 
 
 class TranslationRequest(OpenAIBaseModel):
@@ -2772,7 +2772,7 @@ class TranslationRequest(OpenAIBaseModel):
     formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
     """
 
-    model: Optional[str] = None
+    model: str | None = None
     """ID of the model to use.
     """
 
@@ -2792,7 +2792,7 @@ class TranslationRequest(OpenAIBaseModel):
 
     # TODO support additional sampling parameters
     # --8<-- [start:translation-sampling-params]
-    seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
     """The seed to use for sampling."""
 
     temperature: float = Field(default=0.0)
@@ -2806,7 +2806,7 @@ class TranslationRequest(OpenAIBaseModel):
     # --8<-- [end:translation-sampling-params]
 
     # --8<-- [start:translation-extra-params]
-    language: Optional[str] = None
+    language: str | None = None
     """The language of the input audio we translate from.
 
     Supplying the input language in
@@ -2814,7 +2814,7 @@ class TranslationRequest(OpenAIBaseModel):
     will improve accuracy.
     """
 
-    to_language: Optional[str] = None
+    to_language: str | None = None
     """The language of the input audio we translate to.
 
     Please note that this is not supported by all models, refer to the specific
@@ -2822,14 +2822,14 @@ class TranslationRequest(OpenAIBaseModel):
     For instance, Whisper only supports `to_language=en`.
     """
 
-    stream: Optional[bool] = False
+    stream: bool | None = False
     """Custom field not present in the original OpenAI definition. When set,
     it will enable output to be streamed in a similar fashion as the Chat
     Completion endpoint.
     """
     # Flattened stream option to simplify form data.
-    stream_include_usage: Optional[bool] = False
-    stream_continuous_usage_stats: Optional[bool] = False
+    stream_include_usage: bool | None = False
+    stream_continuous_usage_stats: bool | None = False
     # --8<-- [end:translation-extra-params]
 
     # Default sampling parameters for translation requests.
@@ -2838,7 +2838,7 @@ class TranslationRequest(OpenAIBaseModel):
     }
 
     def to_sampling_params(
-        self, default_max_tokens: int, default_sampling_params: Optional[dict] = None
+        self, default_max_tokens: int, default_sampling_params: dict | None = None
     ) -> SamplingParams:
         max_tokens = default_max_tokens
 
@@ -2939,8 +2939,8 @@ class TranslationResponseVerbose(OpenAIBaseModel):
     text: str
     """The translated text."""
 
-    segments: Optional[list[TranslationSegment]] = None
+    segments: list[TranslationSegment] | None = None
     """Segments of the translated text and their corresponding details."""
 
-    words: Optional[list[TranslationWord]] = None
+    words: list[TranslationWord] | None = None
     """Extracted words and their corresponding timestamps."""
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index e394f24f8793f..ecee27a329d22 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -4,10 +4,9 @@
 import asyncio
 import tempfile
 from argparse import Namespace
-from collections.abc import Awaitable
+from collections.abc import Awaitable, Callable
 from http import HTTPStatus
 from io import StringIO
-from typing import Callable, Optional
 
 import aiohttp
 import torch
@@ -124,7 +123,7 @@ _BAR_FORMAT = "{desc}: {percentage:3.0f}% Completed | {n_fmt}/{total_fmt} [{elap
 class BatchProgressTracker:
     def __init__(self):
         self._total = 0
-        self._pbar: Optional[tqdm] = None
+        self._pbar: tqdm | None = None
 
     def submitted(self):
         self._total += 1
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 94c24ce9b307a..96525f2068593 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -6,7 +6,7 @@ import json
 import time
 from collections.abc import AsyncGenerator, AsyncIterator
 from collections.abc import Sequence as GenericSequence
-from typing import Final, Optional, Union
+from typing import Final
 
 import jinja2
 import partial_json_parser
@@ -82,15 +82,15 @@ class OpenAIServingChat(OpenAIServing):
         models: OpenAIServingModels,
         response_role: str,
         *,
-        request_logger: Optional[RequestLogger],
-        chat_template: Optional[str],
+        request_logger: RequestLogger | None,
+        chat_template: str | None,
         chat_template_content_format: ChatTemplateContentFormatOption,
         trust_request_chat_template: bool = False,
         return_tokens_as_token_ids: bool = False,
         reasoning_parser: str = "",
         enable_auto_tools: bool = False,
         exclude_tools_when_tool_choice_none: bool = False,
-        tool_parser: Optional[str] = None,
+        tool_parser: str | None = None,
         enable_prompt_tokens_details: bool = False,
         enable_force_include_usage: bool = False,
         enable_log_outputs: bool = False,
@@ -159,8 +159,8 @@ class OpenAIServingChat(OpenAIServing):
     async def create_chat_completion(
         self,
         request: ChatCompletionRequest,
-        raw_request: Optional[Request] = None,
-    ) -> Union[AsyncGenerator[str, None], ChatCompletionResponse, ErrorResponse]:
+        raw_request: Request | None = None,
+    ) -> AsyncGenerator[str, None] | ChatCompletionResponse | ErrorResponse:
         """
         Chat Completion API similar to OpenAI's API.
 
@@ -281,7 +281,7 @@ class OpenAIServingChat(OpenAIServing):
                     default_sampling_params=self.default_sampling_params,
                 )
 
-                sampling_params: Union[SamplingParams, BeamSearchParams]
+                sampling_params: SamplingParams | BeamSearchParams
                 if request.use_beam_search:
                     sampling_params = request.to_beam_search_params(
                         max_tokens, self.default_sampling_params
@@ -416,11 +416,11 @@ class OpenAIServingChat(OpenAIServing):
     def extract_tool_call_required_streaming(
         self,
         previous_text: str,
-        current_text: Optional[str],
+        current_text: str | None,
         delta_text: str,
         function_name_returned: bool,
-        tool_call_idx: Optional[int] = None,
-    ) -> tuple[Optional[DeltaMessage], bool]:
+        tool_call_idx: int | None = None,
+    ) -> tuple[DeltaMessage | None, bool]:
         if current_text is None or current_text == "":
             # if the current text is empty, we cannot parse it
             return None, function_name_returned
@@ -548,7 +548,7 @@ class OpenAIServingChat(OpenAIServing):
             and self._should_stream_with_auto_tool_parsing(request)
         )
 
-        all_previous_token_ids: Optional[list[list[int]]]
+        all_previous_token_ids: list[list[int]] | None
         function_name_returned = [False] * num_choices
         if self.tool_call_id_type == "kimi_k2":
             history_tool_call_cnt = get_history_tool_calls_cnt(conversation)
@@ -583,7 +583,7 @@ class OpenAIServingChat(OpenAIServing):
         # Prepare the tool parser if it's needed
         try:
             if tool_choice_auto and self.tool_parser:
-                tool_parsers: list[Optional[ToolParser]] = [
+                tool_parsers: list[ToolParser | None] = [
                     self.tool_parser(tokenizer)
                 ] * num_choices
             else:
@@ -661,7 +661,7 @@ class OpenAIServingChat(OpenAIServing):
                     # Send response to echo the input portion of the
                     # last message
                     if request.echo:
-                        last_msg_content: Union[str, list[dict[str, str]]] = ""
+                        last_msg_content: str | list[dict[str, str]] = ""
                         if (
                             conversation
                             and "content" in conversation[-1]
@@ -734,7 +734,7 @@ class OpenAIServingChat(OpenAIServing):
                         # Chunked prefill case, don't return empty chunks
                         continue
 
-                    delta_message: Optional[DeltaMessage]
+                    delta_message: DeltaMessage | None
 
                     # just update previous_texts and previous_token_ids
                     if tool_choice_auto or self.reasoning_parser:
@@ -1260,9 +1260,9 @@ class OpenAIServingChat(OpenAIServing):
         conversation: list[ConversationMessage],
         tokenizer: AnyTokenizer,
         request_metadata: RequestResponseMetadata,
-    ) -> Union[ErrorResponse, ChatCompletionResponse]:
+    ) -> ErrorResponse | ChatCompletionResponse:
         created_time = int(time.time())
-        final_res: Optional[RequestOutput] = None
+        final_res: RequestOutput | None = None
 
         try:
             async for res in result_generator:
@@ -1512,7 +1512,7 @@ class OpenAIServingChat(OpenAIServing):
             choices.append(choice_data)
 
         if request.echo:
-            last_msg_content: Union[str, list[dict[str, str]]] = ""
+            last_msg_content: str | list[dict[str, str]] = ""
             if (
                 conversation
                 and "content" in conversation[-1]
@@ -1597,7 +1597,7 @@ class OpenAIServingChat(OpenAIServing):
     def _get_top_logprobs(
         self,
         logprobs: dict[int, Logprob],
-        top_logprobs: Optional[int],
+        top_logprobs: int | None,
         tokenizer: AnyTokenizer,
         should_return_as_token_id: bool,
     ) -> list[ChatCompletionLogProb]:
@@ -1621,10 +1621,10 @@ class OpenAIServingChat(OpenAIServing):
     def _create_chat_logprobs(
         self,
         token_ids: GenericSequence[int],
-        top_logprobs: GenericSequence[Optional[dict[int, Logprob]]],
+        top_logprobs: GenericSequence[dict[int, Logprob] | None],
         tokenizer: AnyTokenizer,
-        num_output_top_logprobs: Optional[int] = None,
-        return_as_token_id: Optional[bool] = None,
+        num_output_top_logprobs: int | None = None,
+        return_as_token_id: bool | None = None,
     ) -> ChatCompletionLogProbs:
         """Create OpenAI-style logprobs."""
         logprobs_content: list[ChatCompletionLogProbsContent] = []
@@ -1693,7 +1693,7 @@ class OpenAIServingChat(OpenAIServing):
 
     def _should_check_for_unstreamed_tool_arg_tokens(
         self,
-        delta_message: Optional[DeltaMessage],
+        delta_message: DeltaMessage | None,
         output: CompletionOutput,
     ) -> bool:
         """
diff --git a/vllm/entrypoints/openai/serving_classification.py b/vllm/entrypoints/openai/serving_classification.py
index 0e9a5846276bc..45bbe732a680f 100644
--- a/vllm/entrypoints/openai/serving_classification.py
+++ b/vllm/entrypoints/openai/serving_classification.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from http import HTTPStatus
-from typing import Optional, Union, cast
+from typing import cast
 
 import numpy as np
 from fastapi import Request
@@ -36,7 +36,7 @@ class ClassificationMixin(OpenAIServing):
     async def _preprocess(
         self,
         ctx: ServeContext,
-    ) -> Optional[ErrorResponse]:
+    ) -> ErrorResponse | None:
         """
         Process classification inputs: tokenize text, resolve adapters,
         and prepare model-specific inputs.
@@ -70,7 +70,7 @@ class ClassificationMixin(OpenAIServing):
     def _build_response(
         self,
         ctx: ServeContext,
-    ) -> Union[ClassificationResponse, ErrorResponse]:
+    ) -> ClassificationResponse | ErrorResponse:
         """
         Convert model outputs to a formatted classification response
         with probabilities and labels.
@@ -129,7 +129,7 @@ class ServingClassification(ClassificationMixin):
         engine_client: EngineClient,
         models: OpenAIServingModels,
         *,
-        request_logger: Optional[RequestLogger],
+        request_logger: RequestLogger | None,
         log_error_stack: bool = False,
     ) -> None:
         super().__init__(
@@ -143,7 +143,7 @@ class ServingClassification(ClassificationMixin):
         self,
         request: ClassificationRequest,
         raw_request: Request,
-    ) -> Union[ClassificationResponse, ErrorResponse]:
+    ) -> ClassificationResponse | ErrorResponse:
         model_name = self.models.model_name()
         request_id = f"{self.request_id_prefix}-{self._base_request_id(raw_request)}"
 
@@ -160,7 +160,7 @@ class ServingClassification(ClassificationMixin):
     def _create_pooling_params(
         self,
         ctx: ClassificationServeContext,
-    ) -> Union[PoolingParams, ErrorResponse]:
+    ) -> PoolingParams | ErrorResponse:
         pooling_params = super()._create_pooling_params(ctx)
         if isinstance(pooling_params, ErrorResponse):
             return pooling_params
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index d18301103e475..7af64306023a3 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -5,7 +5,7 @@ import asyncio
 import time
 from collections.abc import AsyncGenerator, AsyncIterator
 from collections.abc import Sequence as GenericSequence
-from typing import Optional, Union, cast
+from typing import cast
 
 import jinja2
 from fastapi import Request
@@ -45,7 +45,7 @@ class OpenAIServingCompletion(OpenAIServing):
         engine_client: EngineClient,
         models: OpenAIServingModels,
         *,
-        request_logger: Optional[RequestLogger],
+        request_logger: RequestLogger | None,
         return_tokens_as_token_ids: bool = False,
         enable_prompt_tokens_details: bool = False,
         enable_force_include_usage: bool = False,
@@ -73,8 +73,8 @@ class OpenAIServingCompletion(OpenAIServing):
     async def create_completion(
         self,
         request: CompletionRequest,
-        raw_request: Optional[Request] = None,
-    ) -> Union[AsyncGenerator[str, None], CompletionResponse, ErrorResponse]:
+        raw_request: Request | None = None,
+    ) -> AsyncGenerator[str, None] | CompletionResponse | ErrorResponse:
         """Completion API similar to OpenAI's API.
 
         See https://platform.openai.com/docs/api-reference/completions/create
@@ -166,7 +166,7 @@ class OpenAIServingCompletion(OpenAIServing):
                     default_sampling_params=self.default_sampling_params,
                 )
 
-                sampling_params: Union[SamplingParams, BeamSearchParams]
+                sampling_params: SamplingParams | BeamSearchParams
                 if request.use_beam_search:
                     sampling_params = request.to_beam_search_params(
                         max_tokens, self.default_sampling_params
@@ -196,7 +196,7 @@ class OpenAIServingCompletion(OpenAIServing):
                 # Mypy inconsistently requires this second cast in different
                 # environments. It shouldn't be necessary (redundant from above)
                 # but pre-commit in CI fails without it.
-                engine_prompt = cast(Union[EmbedsPrompt, TokensPrompt], engine_prompt)
+                engine_prompt = cast(EmbedsPrompt | TokensPrompt, engine_prompt)
                 if isinstance(sampling_params, BeamSearchParams):
                     generator = self.beam_search(
                         prompt=engine_prompt,
@@ -260,7 +260,7 @@ class OpenAIServingCompletion(OpenAIServing):
             )
 
         # Non-streaming response
-        final_res_batch: list[Optional[RequestOutput]] = [None] * num_prompts
+        final_res_batch: list[RequestOutput | None] = [None] * num_prompts
         try:
             async for i, res in result_generator:
                 final_res_batch[i] = res
@@ -312,7 +312,7 @@ class OpenAIServingCompletion(OpenAIServing):
     async def completion_stream_generator(
         self,
         request: CompletionRequest,
-        engine_prompts: list[Union[TokensPrompt, EmbedsPrompt]],
+        engine_prompts: list[TokensPrompt | EmbedsPrompt],
         result_generator: AsyncIterator[tuple[int, RequestOutput]],
         request_id: str,
         created_time: int,
@@ -362,7 +362,7 @@ class OpenAIServingCompletion(OpenAIServing):
                     num_prompt_tokens[prompt_idx] = len(prompt_token_ids)
 
                 delta_token_ids: GenericSequence[int]
-                out_logprobs: Optional[GenericSequence[Optional[dict[int, Logprob]]]]
+                out_logprobs: GenericSequence[dict[int, Logprob] | None] | None
 
                 for output in res.outputs:
                     i = output.index + prompt_idx * num_choices
@@ -370,7 +370,7 @@ class OpenAIServingCompletion(OpenAIServing):
                     # Useful when request.return_token_ids is True
                     # Returning prompt token IDs shares the same logic
                     # with the echo implementation.
-                    prompt_token_ids_to_return: Optional[list[int]] = None
+                    prompt_token_ids_to_return: list[int] | None = None
 
                     assert request.max_tokens is not None
                     if request.echo and not has_echoed[i]:
@@ -524,7 +524,7 @@ class OpenAIServingCompletion(OpenAIServing):
             prompt_text = final_res.prompt
 
             token_ids: GenericSequence[int]
-            out_logprobs: Optional[GenericSequence[Optional[dict[int, Logprob]]]]
+            out_logprobs: GenericSequence[dict[int, Logprob] | None] | None
 
             for output in final_res.outputs:
                 assert request.max_tokens is not None
@@ -617,17 +617,17 @@ class OpenAIServingCompletion(OpenAIServing):
     def _create_completion_logprobs(
         self,
         token_ids: GenericSequence[int],
-        top_logprobs: GenericSequence[Optional[dict[int, Logprob]]],
+        top_logprobs: GenericSequence[dict[int, Logprob] | None],
         num_output_top_logprobs: int,
         tokenizer: AnyTokenizer,
         initial_text_offset: int = 0,
-        return_as_token_id: Optional[bool] = None,
+        return_as_token_id: bool | None = None,
     ) -> CompletionLogProbs:
         """Create logprobs for OpenAI Completion API."""
         out_text_offset: list[int] = []
-        out_token_logprobs: list[Optional[float]] = []
+        out_token_logprobs: list[float | None] = []
         out_tokens: list[str] = []
-        out_top_logprobs: list[Optional[dict[str, float]]] = []
+        out_top_logprobs: list[dict[str, float] | None] = []
 
         last_token_len = 0
 
@@ -695,7 +695,7 @@ class OpenAIServingCompletion(OpenAIServing):
     def _build_render_config(
         self,
         request: CompletionRequest,
-        max_input_length: Optional[int] = None,
+        max_input_length: int | None = None,
     ) -> RenderConfig:
         max_input_tokens_len = self.max_model_len - (request.max_tokens or 0)
         return RenderConfig(
diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py
index e0c9d9aa812f6..8f1df9a5aea62 100644
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -3,7 +3,7 @@
 
 import base64
 from collections.abc import AsyncGenerator, Mapping
-from typing import Any, Final, Literal, Optional, Union, cast
+from typing import Any, Final, Literal, cast
 
 import numpy as np
 import torch
@@ -48,7 +48,7 @@ logger = init_logger(__name__)
 def _get_embedding(
     output: EmbeddingOutput,
     encoding_format: Literal["float", "base64"],
-) -> Union[list[float], str]:
+) -> list[float] | str:
     if encoding_format == "float":
         return output.embedding
     elif encoding_format == "base64":
@@ -80,7 +80,7 @@ class EmbeddingMixin(OpenAIServing):
     async def _preprocess(
         self,
         ctx: ServeContext,
-    ) -> Optional[ErrorResponse]:
+    ) -> ErrorResponse | None:
         ctx = cast(EmbeddingServeContext, ctx)
         try:
             ctx.lora_request = self._maybe_get_adapters(ctx.request)
@@ -130,7 +130,7 @@ class EmbeddingMixin(OpenAIServing):
     def _build_response(
         self,
         ctx: ServeContext,
-    ) -> Union[EmbeddingResponse, ErrorResponse]:
+    ) -> EmbeddingResponse | ErrorResponse:
         items: list[EmbeddingResponseData] = []
         num_prompt_tokens = 0
 
@@ -314,9 +314,9 @@ class EmbeddingMixin(OpenAIServing):
         ctx: EmbeddingServeContext,
         engine_prompt: EngineTokensPrompt,
         pooling_params: PoolingParams,
-        trace_headers: Optional[Mapping[str, str]],
+        trace_headers: Mapping[str, str] | None,
         prompt_index: int,
-    ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
+    ) -> AsyncGenerator[RequestOutput | PoolingRequestOutput, None]:
         """Create a generator for a single prompt using standard processing."""
         request_id_item = f"{ctx.request_id}-{prompt_index}"
 
@@ -341,7 +341,7 @@ class EmbeddingMixin(OpenAIServing):
     async def _prepare_generators(
         self,
         ctx: ServeContext,
-    ) -> Optional[ErrorResponse]:
+    ) -> ErrorResponse | None:
         """Override to support chunked processing."""
         ctx = cast(EmbeddingServeContext, ctx)
 
@@ -354,7 +354,7 @@ class EmbeddingMixin(OpenAIServing):
 
         # Custom logic for chunked processing
         generators: list[
-            AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]
+            AsyncGenerator[RequestOutput | PoolingRequestOutput, None]
         ] = []
 
         try:
@@ -413,7 +413,7 @@ class EmbeddingMixin(OpenAIServing):
     async def _collect_batch(
         self,
         ctx: ServeContext,
-    ) -> Optional[ErrorResponse]:
+    ) -> ErrorResponse | None:
         """Collect and aggregate batch results
         with support for chunked processing.
 
@@ -522,9 +522,7 @@ class EmbeddingMixin(OpenAIServing):
                     )
 
             # Finalize aggregated results
-            final_res_batch: list[
-                Union[PoolingRequestOutput, EmbeddingRequestOutput]
-            ] = []
+            final_res_batch: list[PoolingRequestOutput | EmbeddingRequestOutput] = []
             num_prompts = len(ctx.engine_prompts)
 
             for prompt_idx in range(num_prompts):
@@ -581,7 +579,7 @@ class EmbeddingMixin(OpenAIServing):
                     )
 
             ctx.final_res_batch = cast(
-                list[Union[RequestOutput, PoolingRequestOutput]], final_res_batch
+                list[RequestOutput | PoolingRequestOutput], final_res_batch
             )
 
             return None
@@ -598,8 +596,8 @@ class OpenAIServingEmbedding(EmbeddingMixin):
         engine_client: EngineClient,
         models: OpenAIServingModels,
         *,
-        request_logger: Optional[RequestLogger],
-        chat_template: Optional[str],
+        request_logger: RequestLogger | None,
+        chat_template: str | None,
         chat_template_content_format: ChatTemplateContentFormatOption,
         trust_request_chat_template: bool = False,
         log_error_stack: bool = False,
@@ -618,8 +616,8 @@ class OpenAIServingEmbedding(EmbeddingMixin):
     async def create_embedding(
         self,
         request: EmbeddingRequest,
-        raw_request: Optional[Request] = None,
-    ) -> Union[EmbeddingResponse, ErrorResponse]:
+        raw_request: Request | None = None,
+    ) -> EmbeddingResponse | ErrorResponse:
         """
         Embedding API similar to OpenAI's API.
 
@@ -647,7 +645,7 @@ class OpenAIServingEmbedding(EmbeddingMixin):
     def _create_pooling_params(
         self,
         ctx: ServeContext[EmbeddingRequest],
-    ) -> Union[PoolingParams, ErrorResponse]:
+    ) -> PoolingParams | ErrorResponse:
         pooling_params = super()._create_pooling_params(ctx)
         if isinstance(pooling_params, ErrorResponse):
             return pooling_params
@@ -662,7 +660,7 @@ class OpenAIServingEmbedding(EmbeddingMixin):
     async def _preprocess(
         self,
         ctx: ServeContext,
-    ) -> Optional[ErrorResponse]:
+    ) -> ErrorResponse | None:
         if isinstance(ctx.request, EmbeddingChatRequest):
             error_check_ret = self._validate_chat_template(
                 request_chat_template=ctx.request.chat_template,
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 0d1a525c6d3da..a041950ffd20b 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -5,10 +5,10 @@ import json
 import sys
 import time
 import traceback
-from collections.abc import AsyncGenerator, Iterable, Mapping, Sequence
+from collections.abc import AsyncGenerator, Callable, Iterable, Mapping, Sequence
 from concurrent.futures import ThreadPoolExecutor
 from http import HTTPStatus
-from typing import Any, Callable, ClassVar, Generic, Optional, TypeVar, Union
+from typing import Any, ClassVar, Generic, TypeAlias, TypeVar
 
 import torch
 from fastapi import Request
@@ -102,38 +102,38 @@ from vllm.v1.engine import EngineCoreRequest
 
 logger = init_logger(__name__)
 
-CompletionLikeRequest = Union[
-    CompletionRequest,
-    DetokenizeRequest,
-    EmbeddingCompletionRequest,
-    RerankRequest,
-    ClassificationRequest,
-    ScoreRequest,
-    TokenizeCompletionRequest,
-]
+CompletionLikeRequest: TypeAlias = (
+    CompletionRequest
+    | DetokenizeRequest
+    | EmbeddingCompletionRequest
+    | RerankRequest
+    | ClassificationRequest
+    | ScoreRequest
+    | TokenizeCompletionRequest
+)
 
-ChatLikeRequest = Union[
-    ChatCompletionRequest, EmbeddingChatRequest, TokenizeChatRequest
-]
-SpeechToTextRequest = Union[TranscriptionRequest, TranslationRequest]
-AnyRequest = Union[
-    CompletionLikeRequest,
-    ChatLikeRequest,
-    SpeechToTextRequest,
-    ResponsesRequest,
-    IOProcessorRequest,
-]
+ChatLikeRequest: TypeAlias = (
+    ChatCompletionRequest | EmbeddingChatRequest | TokenizeChatRequest
+)
+SpeechToTextRequest: TypeAlias = TranscriptionRequest | TranslationRequest
+AnyRequest: TypeAlias = (
+    CompletionLikeRequest
+    | ChatLikeRequest
+    | SpeechToTextRequest
+    | ResponsesRequest
+    | IOProcessorRequest
+)
 
-AnyResponse = Union[
-    CompletionResponse,
-    ChatCompletionResponse,
-    EmbeddingResponse,
-    TranscriptionResponse,
-    TokenizeResponse,
-    PoolingResponse,
-    ClassificationResponse,
-    ScoreResponse,
-]
+AnyResponse: TypeAlias = (
+    CompletionResponse
+    | ChatCompletionResponse
+    | EmbeddingResponse
+    | TranscriptionResponse
+    | TokenizeResponse
+    | PoolingResponse
+    | ClassificationResponse
+    | ScoreResponse
+)
 
 
 class TextTokensPrompt(TypedDict):
@@ -145,7 +145,7 @@ class EmbedsPrompt(TypedDict):
     prompt_embeds: torch.Tensor
 
 
-RequestPrompt = Union[list[int], str, TextTokensPrompt, EmbedsPrompt]
+RequestPrompt: TypeAlias = list[int] | str | TextTokensPrompt | EmbedsPrompt
 
 
 def is_text_tokens_prompt(prompt: RequestPrompt) -> TypeIs[TextTokensPrompt]:
@@ -173,8 +173,8 @@ class RequestProcessingMixin(BaseModel):
     handling prompt preparation and engine input.
     """
 
-    request_prompts: Optional[Sequence[RequestPrompt]] = []
-    engine_prompts: Optional[list[EngineTokensPrompt]] = []
+    request_prompts: Sequence[RequestPrompt] | None = []
+    engine_prompts: list[EngineTokensPrompt] | None = []
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
@@ -185,10 +185,10 @@ class ResponseGenerationMixin(BaseModel):
     managing result generators and final batch results.
     """
 
-    result_generator: Optional[
-        AsyncGenerator[tuple[int, Union[RequestOutput, PoolingRequestOutput]], None]
-    ] = None
-    final_res_batch: list[Union[RequestOutput, PoolingRequestOutput]] = Field(
+    result_generator: (
+        AsyncGenerator[tuple[int, RequestOutput | PoolingRequestOutput], None] | None
+    ) = None
+    final_res_batch: list[RequestOutput | PoolingRequestOutput] = Field(
         default_factory=list
     )
 
@@ -203,14 +203,14 @@ class ServeContext(
 ):
     # Shared across all requests
     request: RequestT
-    raw_request: Optional[Request] = None
+    raw_request: Request | None = None
     model_name: str
     request_id: str
     created_time: int = Field(default_factory=lambda: int(time.time()))
-    lora_request: Optional[LoRARequest] = None
+    lora_request: LoRARequest | None = None
 
     # Shared across most requests
-    tokenizer: Optional[AnyTokenizer] = None
+    tokenizer: AnyTokenizer | None = None
 
     # `protected_namespaces` resolves Pydantic v2's warning
     # on conflict with protected namespace "model_"
@@ -224,7 +224,7 @@ ClassificationServeContext = ServeContext[ClassificationRequest]
 
 
 class EmbeddingServeContext(ServeContext[EmbeddingRequest]):
-    chat_template: Optional[str] = None
+    chat_template: str | None = None
     chat_template_content_format: ChatTemplateContentFormatOption
 
 
@@ -247,7 +247,7 @@ class OpenAIServing:
         engine_client: EngineClient,
         models: OpenAIServingModels,
         *,
-        request_logger: Optional[RequestLogger],
+        request_logger: RequestLogger | None,
         return_tokens_as_token_ids: bool = False,
         enable_force_include_usage: bool = False,
         log_error_stack: bool = False,
@@ -276,8 +276,8 @@ class OpenAIServing:
         self.max_model_len = self.model_config.max_model_len
 
     def _get_tool_parser(
-        self, tool_parser_name: Optional[str] = None, enable_auto_tools: bool = False
-    ) -> Optional[Callable[[AnyTokenizer], ToolParser]]:
+        self, tool_parser_name: str | None = None, enable_auto_tools: bool = False
+    ) -> Callable[[AnyTokenizer], ToolParser] | None:
         """Get the tool parser based on the name."""
         parser = None
         if not enable_auto_tools or tool_parser_name is None:
@@ -307,7 +307,7 @@ class OpenAIServing:
     def _get_reasoning_parser(
         self,
         reasoning_parser_name: str,
-    ) -> Optional[Callable[[AnyTokenizer], ReasoningParser]]:
+    ) -> Callable[[AnyTokenizer], ReasoningParser] | None:
         """Get the reasoning parser based on the name."""
         parser = None
         if not reasoning_parser_name:
@@ -328,7 +328,7 @@ class OpenAIServing:
         prompt: PromptType,
         request_id: str,
         params: BeamSearchParams,
-        lora_request: Optional[LoRARequest] = None,
+        lora_request: LoRARequest | None = None,
     ) -> AsyncGenerator[RequestOutput, None]:
         beam_width = params.beam_width
         max_tokens = params.max_tokens
@@ -364,9 +364,9 @@ class OpenAIServing:
         #    this happens again in generation, so the double expansion causes
         #    a mismatch.
         # TODO - would be ideal to handle this more gracefully.
-        prompt_text: Optional[str]
+        prompt_text: str | None
         prompt_token_ids: list[int]
-        multi_modal_data: Optional[MultiModalDataDict]
+        multi_modal_data: MultiModalDataDict | None
         if isinstance(prompt, str):
             prompt_text = prompt
             prompt_token_ids = []
@@ -376,7 +376,7 @@ class OpenAIServing:
             prompt_token_ids = prompt.get("prompt_token_ids", [])  # type: ignore
             multi_modal_data = prompt.get("multi_modal_data")  # type: ignore
 
-        mm_processor_kwargs: Optional[dict[str, Any]] = processed_inputs.get(
+        mm_processor_kwargs: dict[str, Any] | None = processed_inputs.get(
             "mm_processor_kwargs"
         )  # type: ignore
 
@@ -507,7 +507,7 @@ class OpenAIServing:
             prompt_logprobs=None,
         )
 
-    def _get_renderer(self, tokenizer: Optional[AnyTokenizer]) -> BaseRenderer:
+    def _get_renderer(self, tokenizer: AnyTokenizer | None) -> BaseRenderer:
         """
         Get a Renderer instance with the provided tokenizer.
         Uses shared async tokenizer pool for efficiency.
@@ -545,7 +545,7 @@ class OpenAIServing:
     async def _preprocess(
         self,
         ctx: ServeContext,
-    ) -> Optional[ErrorResponse]:
+    ) -> ErrorResponse | None:
         """
         Default preprocessing hook. Subclasses may override
         to prepare `ctx` (classification, embedding, etc.).
@@ -555,7 +555,7 @@ class OpenAIServing:
     def _build_response(
         self,
         ctx: ServeContext,
-    ) -> Union[AnyResponse, ErrorResponse]:
+    ) -> AnyResponse | ErrorResponse:
         """
         Default response builder. Subclass may override this method
         to return the appropriate response object.
@@ -565,8 +565,8 @@ class OpenAIServing:
     async def handle(
         self,
         ctx: ServeContext,
-    ) -> Union[AnyResponse, ErrorResponse]:
-        generation: AsyncGenerator[Union[AnyResponse, ErrorResponse], None]
+    ) -> AnyResponse | ErrorResponse:
+        generation: AsyncGenerator[AnyResponse | ErrorResponse, None]
         generation = self._pipeline(ctx)
 
         async for response in generation:
@@ -577,7 +577,7 @@ class OpenAIServing:
     async def _pipeline(
         self,
         ctx: ServeContext,
-    ) -> AsyncGenerator[Union[AnyResponse, ErrorResponse], None]:
+    ) -> AsyncGenerator[AnyResponse | ErrorResponse, None]:
         """Execute the request processing pipeline yielding responses."""
         if error := await self._check_model(ctx.request):
             yield error
@@ -598,7 +598,7 @@ class OpenAIServing:
 
         yield self._build_response(ctx)
 
-    def _validate_request(self, ctx: ServeContext) -> Optional[ErrorResponse]:
+    def _validate_request(self, ctx: ServeContext) -> ErrorResponse | None:
         truncate_prompt_tokens = getattr(ctx.request, "truncate_prompt_tokens", None)
 
         if (
@@ -615,7 +615,7 @@ class OpenAIServing:
     def _create_pooling_params(
         self,
         ctx: ServeContext,
-    ) -> Union[PoolingParams, ErrorResponse]:
+    ) -> PoolingParams | ErrorResponse:
         if not hasattr(ctx.request, "to_pooling_params"):
             return self.create_error_response(
                 "Request type does not support pooling parameters"
@@ -626,10 +626,10 @@ class OpenAIServing:
     async def _prepare_generators(
         self,
         ctx: ServeContext,
-    ) -> Optional[ErrorResponse]:
+    ) -> ErrorResponse | None:
         """Schedule the request and get the result generator."""
         generators: list[
-            AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]
+            AsyncGenerator[RequestOutput | PoolingRequestOutput, None]
         ] = []
 
         try:
@@ -678,14 +678,14 @@ class OpenAIServing:
     async def _collect_batch(
         self,
         ctx: ServeContext,
-    ) -> Optional[ErrorResponse]:
+    ) -> ErrorResponse | None:
         """Collect batch results from the result generator."""
         try:
             if ctx.engine_prompts is None:
                 return self.create_error_response("Engine prompts not available")
 
             num_prompts = len(ctx.engine_prompts)
-            final_res_batch: list[Optional[Union[RequestOutput, PoolingRequestOutput]]]
+            final_res_batch: list[RequestOutput | PoolingRequestOutput | None]
             final_res_batch = [None] * num_prompts
 
             if ctx.result_generator is None:
@@ -738,7 +738,7 @@ class OpenAIServing:
     async def _check_model(
         self,
         request: AnyRequest,
-    ) -> Optional[ErrorResponse]:
+    ) -> ErrorResponse | None:
         error_response = None
 
         if self._is_model_supported(request.model):
@@ -764,9 +764,7 @@ class OpenAIServing:
             status_code=HTTPStatus.NOT_FOUND,
         )
 
-    def _get_active_default_mm_loras(
-        self, request: AnyRequest
-    ) -> Optional[LoRARequest]:
+    def _get_active_default_mm_loras(self, request: AnyRequest) -> LoRARequest | None:
         """Determine if there are any active default multimodal loras."""
         # TODO: Currently this is only enabled for chat completions
         # to be better aligned with only being enabled for .generate
@@ -793,7 +791,7 @@ class OpenAIServing:
         self,
         request: AnyRequest,
         supports_default_mm_loras: bool = False,
-    ) -> Optional[LoRARequest]:
+    ) -> LoRARequest | None:
         if request.model in self.models.lora_requests:
             return self.models.lora_requests[request.model]
 
@@ -877,7 +875,7 @@ class OpenAIServing:
         self,
         request: AnyRequest,
         prompt_ids: list[int],
-        tokenizer: Optional[AnyTokenizer],
+        tokenizer: AnyTokenizer | None,
     ) -> TextTokensPrompt:
         truncate_prompt_tokens = getattr(request, "truncate_prompt_tokens", None)
 
@@ -972,7 +970,7 @@ class OpenAIServing:
         self,
         request: AnyRequest,
         tokenizer: AnyTokenizer,
-        prompt_input: Union[str, list[int]],
+        prompt_input: str | list[int],
         add_special_tokens: bool = True,
     ) -> TextTokensPrompt:
         """
@@ -991,7 +989,7 @@ class OpenAIServing:
         self,
         request: AnyRequest,
         tokenizer: AnyTokenizer,
-        prompt_inputs: Iterable[Union[str, list[int]]],
+        prompt_inputs: Iterable[str | list[int]],
         add_special_tokens: bool = True,
     ) -> AsyncGenerator[TextTokensPrompt, None]:
         """
@@ -1014,10 +1012,10 @@ class OpenAIServing:
 
     def _validate_chat_template(
         self,
-        request_chat_template: Optional[str],
-        chat_template_kwargs: Optional[dict[str, Any]],
+        request_chat_template: str | None,
+        chat_template_kwargs: dict[str, Any] | None,
         trust_request_chat_template: bool,
-    ) -> Optional[ErrorResponse]:
+    ) -> ErrorResponse | None:
         if not trust_request_chat_template and (
             request_chat_template is not None
             or (
@@ -1034,17 +1032,17 @@ class OpenAIServing:
 
     async def _preprocess_chat(
         self,
-        request: Union[ChatLikeRequest, ResponsesRequest],
+        request: ChatLikeRequest | ResponsesRequest,
         tokenizer: AnyTokenizer,
         messages: list[ChatCompletionMessageParam],
-        chat_template: Optional[str],
+        chat_template: str | None,
         chat_template_content_format: ChatTemplateContentFormatOption,
         add_generation_prompt: bool = True,
         continue_final_message: bool = False,
-        tool_dicts: Optional[list[dict[str, Any]]] = None,
-        documents: Optional[list[dict[str, str]]] = None,
-        chat_template_kwargs: Optional[dict[str, Any]] = None,
-        tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None,
+        tool_dicts: list[dict[str, Any]] | None = None,
+        documents: list[dict[str, str]] | None = None,
+        chat_template_kwargs: dict[str, Any] | None = None,
+        tool_parser: Callable[[AnyTokenizer], ToolParser] | None = None,
         add_special_tokens: bool = False,
     ) -> tuple[
         list[ConversationMessage],
@@ -1076,7 +1074,7 @@ class OpenAIServing:
         )
         _chat_template_kwargs.update(chat_template_kwargs or {})
 
-        request_prompt: Union[str, list[int]]
+        request_prompt: str | list[int]
 
         if tokenizer is None:
             request_prompt = "placeholder"
@@ -1158,10 +1156,10 @@ class OpenAIServing:
         self,
         request_id: str,
         engine_prompt: PromptType,
-        params: Union[SamplingParams, PoolingParams],
+        params: SamplingParams | PoolingParams,
         *,
-        lora_request: Optional[LoRARequest],
-        trace_headers: Optional[Mapping[str, str]],
+        lora_request: LoRARequest | None,
+        trace_headers: Mapping[str, str] | None,
         priority: int,
     ) -> tuple[EngineCoreRequest, dict[str, Any]]:
         """Use the Processor to process inputs for AsyncLLM."""
@@ -1188,7 +1186,7 @@ class OpenAIServing:
         engine_prompt: EngineTokensPrompt,
         sampling_params: SamplingParams,
         context: ConversationContext,
-        lora_request: Optional[LoRARequest] = None,
+        lora_request: LoRARequest | None = None,
         priority: int = 0,
         **kwargs,
     ):
@@ -1250,7 +1248,7 @@ class OpenAIServing:
 
     def _get_prompt_components(
         self,
-        prompt: Union[RequestPrompt, PromptType],
+        prompt: RequestPrompt | PromptType,
     ) -> PromptComponents:
         if isinstance(prompt, list):
             return PromptComponents(token_ids=prompt)
@@ -1260,9 +1258,9 @@ class OpenAIServing:
     def _log_inputs(
         self,
         request_id: str,
-        inputs: Union[RequestPrompt, PromptType],
-        params: Optional[Union[SamplingParams, PoolingParams, BeamSearchParams]],
-        lora_request: Optional[LoRARequest],
+        inputs: RequestPrompt | PromptType,
+        params: SamplingParams | PoolingParams | BeamSearchParams | None,
+        lora_request: LoRARequest | None,
     ) -> None:
         if self.request_logger is None:
             return
@@ -1281,7 +1279,7 @@ class OpenAIServing:
     async def _get_trace_headers(
         self,
         headers: Headers,
-    ) -> Optional[Mapping[str, str]]:
+    ) -> Mapping[str, str] | None:
         is_tracing_enabled = await self.engine_client.is_tracing_enabled()
 
         if is_tracing_enabled:
@@ -1294,8 +1292,8 @@ class OpenAIServing:
 
     @staticmethod
     def _base_request_id(
-        raw_request: Optional[Request], default: Optional[str] = None
-    ) -> Optional[str]:
+        raw_request: Request | None, default: str | None = None
+    ) -> str | None:
         """Pulls the request id to use from a header, if provided"""
         default = default or random_uuid()
         if raw_request is None:
@@ -1317,15 +1315,15 @@ class OpenAIServing:
             return logprob.decoded_token
         return tokenizer.decode(token_id)
 
-    def _is_model_supported(self, model_name: Optional[str]) -> bool:
+    def _is_model_supported(self, model_name: str | None) -> bool:
         if not model_name:
             return True
         return self.models.is_base_model(model_name)
 
 
 def clamp_prompt_logprobs(
-    prompt_logprobs: Union[PromptLogprobs, None],
-) -> Union[PromptLogprobs, None]:
+    prompt_logprobs: PromptLogprobs | None,
+) -> PromptLogprobs | None:
     if prompt_logprobs is None:
         return prompt_logprobs
 
diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py
index 1aaac60f29933..9b7deb40b93f6 100644
--- a/vllm/entrypoints/openai/serving_models.py
+++ b/vllm/entrypoints/openai/serving_models.py
@@ -5,7 +5,6 @@ from asyncio import Lock
 from collections import defaultdict
 from dataclasses import dataclass
 from http import HTTPStatus
-from typing import Optional, Union
 
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.openai.protocol import (
@@ -35,7 +34,7 @@ class BaseModelPath:
 class LoRAModulePath:
     name: str
     path: str
-    base_model_name: Optional[str] = None
+    base_model_name: str | None = None
 
 
 class OpenAIServingModels:
@@ -52,7 +51,7 @@ class OpenAIServingModels:
         engine_client: EngineClient,
         base_model_paths: list[BaseModelPath],
         *,
-        lora_modules: Optional[list[LoRAModulePath]] = None,
+        lora_modules: list[LoRAModulePath] | None = None,
     ):
         super().__init__()
 
@@ -93,7 +92,7 @@ class OpenAIServingModels:
     def is_base_model(self, model_name) -> bool:
         return any(model.name == model_name for model in self.base_model_paths)
 
-    def model_name(self, lora_request: Optional[LoRARequest] = None) -> str:
+    def model_name(self, lora_request: LoRARequest | None = None) -> str:
         """Returns the appropriate model name depending on the availability
         and support of the LoRA or base model.
         Parameters:
@@ -132,8 +131,8 @@ class OpenAIServingModels:
         return ModelList(data=model_cards)
 
     async def load_lora_adapter(
-        self, request: LoadLoRAAdapterRequest, base_model_name: Optional[str] = None
-    ) -> Union[ErrorResponse, str]:
+        self, request: LoadLoRAAdapterRequest, base_model_name: str | None = None
+    ) -> ErrorResponse | str:
         lora_name = request.lora_name
 
         # Ensure atomicity based on the lora name
@@ -173,7 +172,7 @@ class OpenAIServingModels:
 
     async def unload_lora_adapter(
         self, request: UnloadLoRAAdapterRequest
-    ) -> Union[ErrorResponse, str]:
+    ) -> ErrorResponse | str:
         lora_name = request.lora_name
 
         # Ensure atomicity based on the lora name
@@ -189,7 +188,7 @@ class OpenAIServingModels:
 
     async def _check_load_lora_adapter_request(
         self, request: LoadLoRAAdapterRequest
-    ) -> Optional[ErrorResponse]:
+    ) -> ErrorResponse | None:
         # Check if both 'lora_name' and 'lora_path' are provided
         if not request.lora_name or not request.lora_path:
             return create_error_response(
@@ -211,7 +210,7 @@ class OpenAIServingModels:
 
     async def _check_unload_lora_adapter_request(
         self, request: UnloadLoRAAdapterRequest
-    ) -> Optional[ErrorResponse]:
+    ) -> ErrorResponse | None:
         # Check if 'lora_name' is not provided return an error
         if not request.lora_name:
             return create_error_response(
@@ -230,7 +229,7 @@ class OpenAIServingModels:
 
         return None
 
-    async def resolve_lora(self, lora_name: str) -> Union[LoRARequest, ErrorResponse]:
+    async def resolve_lora(self, lora_name: str) -> LoRARequest | ErrorResponse:
         """Attempt to resolve a LoRA adapter using available resolvers.
 
         Args:
diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py
index 964655fb7f65e..39cc539c11871 100644
--- a/vllm/entrypoints/openai/serving_pooling.py
+++ b/vllm/entrypoints/openai/serving_pooling.py
@@ -5,7 +5,7 @@ import asyncio
 import base64
 import time
 from collections.abc import AsyncGenerator
-from typing import Final, Literal, Optional, Union, cast
+from typing import Final, Literal, cast
 
 import jinja2
 import numpy as np
@@ -41,7 +41,7 @@ logger = init_logger(__name__)
 def _get_data(
     output: PoolingOutput,
     encoding_format: Literal["float", "base64"],
-) -> Union[list[float], str]:
+) -> list[float] | str:
     if encoding_format == "float":
         return output.data.tolist()
     elif encoding_format == "base64":
@@ -60,8 +60,8 @@ class OpenAIServingPooling(OpenAIServing):
         engine_client: EngineClient,
         models: OpenAIServingModels,
         *,
-        request_logger: Optional[RequestLogger],
-        chat_template: Optional[str],
+        request_logger: RequestLogger | None,
+        chat_template: str | None,
         chat_template_content_format: ChatTemplateContentFormatOption,
         trust_request_chat_template: bool = False,
         log_error_stack: bool = False,
@@ -80,8 +80,8 @@ class OpenAIServingPooling(OpenAIServing):
     async def create_pooling(
         self,
         request: PoolingRequest,
-        raw_request: Optional[Request] = None,
-    ) -> Union[PoolingResponse, IOProcessorResponse, ErrorResponse]:
+        raw_request: Request | None = None,
+    ) -> PoolingResponse | IOProcessorResponse | ErrorResponse:
         """
         See https://platform.openai.com/docs/api-reference/embeddings/create
         for the API specification. This API mimics the OpenAI Embedding API.
@@ -219,7 +219,7 @@ class OpenAIServingPooling(OpenAIServing):
         num_prompts = len(engine_prompts)
 
         # Non-streaming response
-        final_res_batch: list[Optional[PoolingRequestOutput]]
+        final_res_batch: list[PoolingRequestOutput | None]
         final_res_batch = [None] * num_prompts
         try:
             async for i, res in result_generator:
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 60f8b78ed1757..3b9015efd305d 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -6,11 +6,11 @@ import json
 import time
 import uuid
 from collections import deque
-from collections.abc import AsyncGenerator, AsyncIterator, Sequence
+from collections.abc import AsyncGenerator, AsyncIterator, Callable, Sequence
 from contextlib import AsyncExitStack
 from copy import copy
 from http import HTTPStatus
-from typing import Callable, Final, Optional, Union
+from typing import Final
 
 import jinja2
 from fastapi import Request
@@ -109,14 +109,14 @@ class OpenAIServingResponses(OpenAIServing):
         engine_client: EngineClient,
         models: OpenAIServingModels,
         *,
-        request_logger: Optional[RequestLogger],
-        chat_template: Optional[str],
+        request_logger: RequestLogger | None,
+        chat_template: str | None,
         chat_template_content_format: ChatTemplateContentFormatOption,
         return_tokens_as_token_ids: bool = False,
         reasoning_parser: str = "",
         enable_auto_tools: bool = False,
-        tool_parser: Optional[str] = None,
-        tool_server: Optional[ToolServer] = None,
+        tool_parser: str | None = None,
+        tool_server: ToolServer | None = None,
         enable_prompt_tokens_details: bool = False,
         enable_force_include_usage: bool = False,
         enable_log_outputs: bool = False,
@@ -210,7 +210,7 @@ class OpenAIServingResponses(OpenAIServing):
 
     def _validate_generator_input(
         self, engine_prompt: EngineTokensPrompt
-    ) -> Optional[ErrorResponse]:
+    ) -> ErrorResponse | None:
         """Add validations to the input to the generator here."""
         if self.max_model_len <= len(engine_prompt["prompt_token_ids"]):
             error_message = (
@@ -229,12 +229,12 @@ class OpenAIServingResponses(OpenAIServing):
     async def create_responses(
         self,
         request: ResponsesRequest,
-        raw_request: Optional[Request] = None,
-    ) -> Union[
-        AsyncGenerator[StreamingResponsesResponse, None],
-        ResponsesResponse,
-        ErrorResponse,
-    ]:
+        raw_request: Request | None = None,
+    ) -> (
+        AsyncGenerator[StreamingResponsesResponse, None]
+        | ResponsesResponse
+        | ErrorResponse
+    ):
         error_check_ret = await self._check_model(request)
         if error_check_ret is not None:
             logger.error("Error with model %s", error_check_ret)
@@ -460,7 +460,7 @@ class OpenAIServingResponses(OpenAIServing):
     async def _make_request(
         self,
         request: ResponsesRequest,
-        prev_response: Optional[ResponsesResponse],
+        prev_response: ResponsesResponse | None,
         tokenizer: AnyTokenizer,
     ):
         if len(request.tools) > 0:
@@ -481,7 +481,7 @@ class OpenAIServingResponses(OpenAIServing):
     def _make_request_with_harmony(
         self,
         request: ResponsesRequest,
-        prev_response: Optional[ResponsesResponse],
+        prev_response: ResponsesResponse | None,
     ):
         if request.tool_choice != "auto":
             raise NotImplementedError(
@@ -522,8 +522,8 @@ class OpenAIServingResponses(OpenAIServing):
         model_name: str,
         tokenizer: AnyTokenizer,
         request_metadata: RequestResponseMetadata,
-        created_time: Optional[int] = None,
-    ) -> Union[ErrorResponse, ResponsesResponse]:
+        created_time: int | None = None,
+    ) -> ErrorResponse | ResponsesResponse:
         if created_time is None:
             created_time = int(time.time())
 
@@ -642,9 +642,9 @@ class OpenAIServingResponses(OpenAIServing):
     def _create_response_logprobs(
         self,
         token_ids: Sequence[int],
-        logprobs: Optional[SampleLogprobs],
+        logprobs: SampleLogprobs | None,
         tokenizer: AnyTokenizer,
-        top_logprobs: Optional[int] = None,
+        top_logprobs: int | None = None,
     ) -> list[Logprob]:
         assert logprobs is not None, "logprobs must be provided"
         assert len(token_ids) == len(logprobs), (
@@ -676,9 +676,9 @@ class OpenAIServingResponses(OpenAIServing):
     def _create_stream_response_logprobs(
         self,
         token_ids: Sequence[int],
-        logprobs: Optional[SampleLogprobs],
+        logprobs: SampleLogprobs | None,
         tokenizer: AnyTokenizer,
-        top_logprobs: Optional[int] = None,
+        top_logprobs: int | None = None,
     ) -> list[response_text_delta_event.Logprob]:
         lgs = self._create_response_logprobs(
             token_ids=token_ids,
@@ -793,7 +793,7 @@ class OpenAIServingResponses(OpenAIServing):
     def _construct_input_messages(
         self,
         request: ResponsesRequest,
-        prev_response: Optional[ResponsesResponse] = None,
+        prev_response: ResponsesResponse | None = None,
     ) -> list[ChatCompletionMessageParam]:
         messages: list[ChatCompletionMessageParam] = []
         if request.instructions:
@@ -833,7 +833,7 @@ class OpenAIServingResponses(OpenAIServing):
     def _construct_input_messages_with_harmony(
         self,
         request: ResponsesRequest,
-        prev_response: Optional[ResponsesResponse],
+        prev_response: ResponsesResponse | None,
     ) -> list[OpenAIHarmonyMessage]:
         messages: list[OpenAIHarmonyMessage] = []
         if prev_response is None:
@@ -986,7 +986,7 @@ class OpenAIServingResponses(OpenAIServing):
     async def responses_background_stream_generator(
         self,
         response_id: str,
-        starting_after: Optional[int] = None,
+        starting_after: int | None = None,
     ) -> AsyncGenerator[StreamingResponsesResponse, None]:
         if response_id not in self.event_store:
             raise ValueError(f"Unknown response_id: {response_id}")
@@ -1011,13 +1011,13 @@ class OpenAIServingResponses(OpenAIServing):
     async def retrieve_responses(
         self,
         response_id: str,
-        starting_after: Optional[int],
-        stream: Optional[bool],
-    ) -> Union[
-        ErrorResponse,
-        ResponsesResponse,
-        AsyncGenerator[StreamingResponsesResponse, None],
-    ]:
+        starting_after: int | None,
+        stream: bool | None,
+    ) -> (
+        ErrorResponse
+        | ResponsesResponse
+        | AsyncGenerator[StreamingResponsesResponse, None]
+    ):
         async with self.response_store_lock:
             response = self.response_store.get(response_id)
 
@@ -1034,7 +1034,7 @@ class OpenAIServingResponses(OpenAIServing):
     async def cancel_responses(
         self,
         response_id: str,
-    ) -> Union[ErrorResponse, ResponsesResponse]:
+    ) -> ErrorResponse | ResponsesResponse:
         async with self.response_store_lock:
             response = self.response_store.get(response_id)
             if response is None:
@@ -1082,7 +1082,7 @@ class OpenAIServingResponses(OpenAIServing):
         self,
         request: ResponsesRequest,
         sampling_params: SamplingParams,
-        result_generator: AsyncIterator[Optional[ConversationContext]],
+        result_generator: AsyncIterator[ConversationContext | None],
         context: ConversationContext,
         model_name: str,
         tokenizer: AnyTokenizer,
@@ -1385,7 +1385,7 @@ class OpenAIServingResponses(OpenAIServing):
         self,
         request: ResponsesRequest,
         sampling_params: SamplingParams,
-        result_generator: AsyncIterator[Optional[ConversationContext]],
+        result_generator: AsyncIterator[ConversationContext | None],
         context: ConversationContext,
         model_name: str,
         tokenizer: AnyTokenizer,
@@ -1772,12 +1772,12 @@ class OpenAIServingResponses(OpenAIServing):
         self,
         request: ResponsesRequest,
         sampling_params: SamplingParams,
-        result_generator: AsyncIterator[Optional[ConversationContext]],
+        result_generator: AsyncIterator[ConversationContext | None],
         context: ConversationContext,
         model_name: str,
         tokenizer: AnyTokenizer,
         request_metadata: RequestResponseMetadata,
-        created_time: Optional[int] = None,
+        created_time: int | None = None,
     ) -> AsyncGenerator[StreamingResponsesResponse, None]:
         # TODO:
         # 1. Handle disconnect
diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py
index 84ea33a07fa58..7506e17fe585b 100644
--- a/vllm/entrypoints/openai/serving_score.py
+++ b/vllm/entrypoints/openai/serving_score.py
@@ -3,7 +3,7 @@
 import asyncio
 import time
 from collections.abc import AsyncGenerator, Mapping
-from typing import Any, Optional, Union
+from typing import Any
 
 from fastapi import Request
 
@@ -48,7 +48,7 @@ class ServingScores(OpenAIServing):
         engine_client: EngineClient,
         models: OpenAIServingModels,
         *,
-        request_logger: Optional[RequestLogger],
+        request_logger: RequestLogger | None,
         log_error_stack: bool = False,
     ) -> None:
         super().__init__(
@@ -63,12 +63,12 @@ class ServingScores(OpenAIServing):
         tokenizer: AnyTokenizer,
         texts_1: list[str],
         texts_2: list[str],
-        request: Union[RerankRequest, ScoreRequest],
+        request: RerankRequest | ScoreRequest,
         request_id: str,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
-        lora_request: Optional[Union[LoRARequest, None]] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-    ) -> Union[list[PoolingRequestOutput], ErrorResponse]:
+        tokenization_kwargs: dict[str, Any] | None = None,
+        lora_request: LoRARequest | None = None,
+        trace_headers: Mapping[str, str] | None = None,
+    ) -> list[PoolingRequestOutput] | ErrorResponse:
         input_texts = texts_1 + texts_2
 
         engine_prompts: list[TokensPrompt] = []
@@ -125,7 +125,7 @@ class ServingScores(OpenAIServing):
         # Non-streaming response
         final_res_batch: list[PoolingRequestOutput] = []
 
-        embeddings: list[Optional[PoolingRequestOutput]] = [None] * len(engine_prompts)
+        embeddings: list[PoolingRequestOutput | None] = [None] * len(engine_prompts)
 
         async for i, res in result_generator:
             embeddings[i] = res
@@ -152,11 +152,11 @@ class ServingScores(OpenAIServing):
 
     def _preprocess_score(
         self,
-        request: Union[RerankRequest, ScoreRequest],
+        request: RerankRequest | ScoreRequest,
         tokenizer: AnyTokenizer,
         tokenization_kwargs: dict[str, Any],
-        data_1: Union[str, ScoreContentPartParam],
-        data_2: Union[str, ScoreContentPartParam],
+        data_1: str | ScoreContentPartParam,
+        data_2: str | ScoreContentPartParam,
     ) -> tuple[str, TokensPrompt]:
         model_config = self.model_config
 
@@ -176,14 +176,14 @@ class ServingScores(OpenAIServing):
     async def _cross_encoding_score(
         self,
         tokenizer: AnyTokenizer,
-        data_1: Union[list[str], list[ScoreContentPartParam]],
-        data_2: Union[list[str], list[ScoreContentPartParam]],
-        request: Union[RerankRequest, ScoreRequest],
+        data_1: list[str] | list[ScoreContentPartParam],
+        data_2: list[str] | list[ScoreContentPartParam],
+        request: RerankRequest | ScoreRequest,
         request_id: str,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
-        lora_request: Optional[Union[LoRARequest, None]] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-    ) -> Union[list[PoolingRequestOutput], ErrorResponse]:
+        tokenization_kwargs: dict[str, Any] | None = None,
+        lora_request: LoRARequest | None = None,
+        trace_headers: Mapping[str, str] | None = None,
+    ) -> list[PoolingRequestOutput] | ErrorResponse:
         request_prompts: list[str] = []
         engine_prompts: list[TokensPrompt] = []
 
@@ -259,7 +259,7 @@ class ServingScores(OpenAIServing):
         result_generator = merge_async_iterators(*generators)
 
         # Non-streaming response
-        final_res_batch: list[Optional[PoolingRequestOutput]] = [None] * len(
+        final_res_batch: list[PoolingRequestOutput | None] = [None] * len(
             engine_prompts
         )
 
@@ -270,12 +270,12 @@ class ServingScores(OpenAIServing):
 
     async def _run_scoring(
         self,
-        data_1: Union[list[str], str, ScoreMultiModalParam],
-        data_2: Union[list[str], str, ScoreMultiModalParam],
-        request: Union[ScoreRequest, RerankRequest],
+        data_1: list[str] | str | ScoreMultiModalParam,
+        data_2: list[str] | str | ScoreMultiModalParam,
+        request: ScoreRequest | RerankRequest,
         request_id: str,
-        raw_request: Optional[Request] = None,
-    ) -> Union[list[PoolingRequestOutput], ErrorResponse]:
+        raw_request: Request | None = None,
+    ) -> list[PoolingRequestOutput] | ErrorResponse:
         lora_request = self._maybe_get_adapters(request)
 
         tokenizer = await self.engine_client.get_tokenizer()
@@ -339,8 +339,8 @@ class ServingScores(OpenAIServing):
     async def create_score(
         self,
         request: ScoreRequest,
-        raw_request: Optional[Request] = None,
-    ) -> Union[ScoreResponse, ErrorResponse]:
+        raw_request: Request | None = None,
+    ) -> ScoreResponse | ErrorResponse:
         """
         Score API similar to Sentence Transformers cross encoder
 
@@ -377,8 +377,8 @@ class ServingScores(OpenAIServing):
             return self.create_error_response(str(e))
 
     async def do_rerank(
-        self, request: RerankRequest, raw_request: Optional[Request] = None
-    ) -> Union[RerankResponse, ErrorResponse]:
+        self, request: RerankRequest, raw_request: Request | None = None
+    ) -> RerankResponse | ErrorResponse:
         """
         Rerank API based on JinaAI's rerank API; implements the same
         API interface. Designed for compatibility with off-the-shelf
@@ -468,7 +468,7 @@ class ServingScores(OpenAIServing):
         final_res_batch: list[PoolingRequestOutput],
         request_id: str,
         model_name: str,
-        documents: Union[list[str], ScoreMultiModalParam],
+        documents: list[str] | ScoreMultiModalParam,
         top_n: int,
     ) -> RerankResponse:
         """
diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py
index fb16d5ac690f1..39aae0cd04956 100644
--- a/vllm/entrypoints/openai/serving_tokenization.py
+++ b/vllm/entrypoints/openai/serving_tokenization.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import Any, Final, Optional, Union
+from typing import Any, Final
 
 import jinja2
 from fastapi import Request
@@ -33,8 +33,8 @@ class OpenAIServingTokenization(OpenAIServing):
         engine_client: EngineClient,
         models: OpenAIServingModels,
         *,
-        request_logger: Optional[RequestLogger],
-        chat_template: Optional[str],
+        request_logger: RequestLogger | None,
+        chat_template: str | None,
         chat_template_content_format: ChatTemplateContentFormatOption,
         trust_request_chat_template: bool = False,
         log_error_stack: bool = False,
@@ -54,7 +54,7 @@ class OpenAIServingTokenization(OpenAIServing):
         self,
         request: TokenizeRequest,
         raw_request: Request,
-    ) -> Union[TokenizeResponse, ErrorResponse]:
+    ) -> TokenizeResponse | ErrorResponse:
         error_check_ret = await self._check_model(request)
         if error_check_ret is not None:
             return error_check_ret
@@ -129,7 +129,7 @@ class OpenAIServingTokenization(OpenAIServing):
         self,
         request: DetokenizeRequest,
         raw_request: Request,
-    ) -> Union[DetokenizeResponse, ErrorResponse]:
+    ) -> DetokenizeResponse | ErrorResponse:
         error_check_ret = await self._check_model(request)
         if error_check_ret is not None:
             return error_check_ret
@@ -155,7 +155,7 @@ class OpenAIServingTokenization(OpenAIServing):
 
     async def get_tokenizer_info(
         self,
-    ) -> Union[TokenizerInfoResponse, ErrorResponse]:
+    ) -> TokenizerInfoResponse | ErrorResponse:
         """Get comprehensive tokenizer information."""
         try:
             tokenizer = await self.engine_client.get_tokenizer()
@@ -171,7 +171,7 @@ class OpenAIServingTokenization(OpenAIServing):
 @dataclass
 class TokenizerInfo:
     tokenizer: AnyTokenizer
-    chat_template: Optional[str]
+    chat_template: str | None
 
     def to_dict(self) -> dict[str, Any]:
         """Return the tokenizer configuration."""
diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py
index f6b08bf11aacf..d043f55648d2c 100644
--- a/vllm/entrypoints/openai/serving_transcription.py
+++ b/vllm/entrypoints/openai/serving_transcription.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import AsyncGenerator
-from typing import Optional, Union
 
 from fastapi import Request
 
@@ -35,7 +34,7 @@ class OpenAIServingTranscription(OpenAISpeechToText):
         engine_client: EngineClient,
         models: OpenAIServingModels,
         *,
-        request_logger: Optional[RequestLogger],
+        request_logger: RequestLogger | None,
         return_tokens_as_token_ids: bool = False,
         log_error_stack: bool = False,
     ):
@@ -50,7 +49,7 @@ class OpenAIServingTranscription(OpenAISpeechToText):
 
     async def create_transcription(
         self, audio_data: bytes, request: TranscriptionRequest, raw_request: Request
-    ) -> Union[TranscriptionResponse, AsyncGenerator[str, None], ErrorResponse]:
+    ) -> TranscriptionResponse | AsyncGenerator[str, None] | ErrorResponse:
         """Transcription API similar to OpenAI's API.
 
         See https://platform.openai.com/docs/api-reference/audio/createTranscription
@@ -94,7 +93,7 @@ class OpenAIServingTranslation(OpenAISpeechToText):
         engine_client: EngineClient,
         models: OpenAIServingModels,
         *,
-        request_logger: Optional[RequestLogger],
+        request_logger: RequestLogger | None,
         return_tokens_as_token_ids: bool = False,
         log_error_stack: bool = False,
     ):
@@ -109,7 +108,7 @@ class OpenAIServingTranslation(OpenAISpeechToText):
 
     async def create_translation(
         self, audio_data: bytes, request: TranslationRequest, raw_request: Request
-    ) -> Union[TranslationResponse, AsyncGenerator[str, None], ErrorResponse]:
+    ) -> TranslationResponse | AsyncGenerator[str, None] | ErrorResponse:
         """Translation API similar to OpenAI's API.
 
         See https://platform.openai.com/docs/api-reference/audio/createTranslation
diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py
index 2f518574242bf..fa6e962a1dd70 100644
--- a/vllm/entrypoints/openai/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text.py
@@ -4,9 +4,9 @@ import asyncio
 import io
 import math
 import time
-from collections.abc import AsyncGenerator
+from collections.abc import AsyncGenerator, Callable
 from functools import cached_property
-from typing import Callable, Literal, Optional, TypeVar, Union, cast
+from typing import Literal, TypeAlias, TypeVar, cast
 
 import numpy as np
 from fastapi import Request
@@ -39,7 +39,7 @@ try:
 except ImportError:
     librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
 
-SpeechToTextResponse = Union[TranscriptionResponse, TranslationResponse]
+SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse
 T = TypeVar("T", bound=SpeechToTextResponse)
 
 logger = init_logger(__name__)
@@ -54,7 +54,7 @@ class OpenAISpeechToText(OpenAIServing):
         engine_client: EngineClient,
         models: OpenAIServingModels,
         *,
-        request_logger: Optional[RequestLogger],
+        request_logger: RequestLogger | None,
         return_tokens_as_token_ids: bool = False,
         task_type: Literal["transcribe", "translate"] = "transcribe",
         log_error_stack: bool = False,
@@ -140,7 +140,7 @@ class OpenAISpeechToText(OpenAIServing):
         raw_request: Request,
         response_class: type[T],
         stream_generator_method: Callable[..., AsyncGenerator[str, None]],
-    ) -> Union[T, AsyncGenerator[str, None], ErrorResponse]:
+    ) -> T | AsyncGenerator[str, None] | ErrorResponse:
         """Base method for speech-to-text operations like transcription and
         translation."""
         error_check_ret = await self._check_model(request)
@@ -181,9 +181,7 @@ class OpenAISpeechToText(OpenAIServing):
             logger.exception("Error in preprocessing prompt inputs")
             return self.create_error_response(str(e))
 
-        list_result_generator: Optional[list[AsyncGenerator[RequestOutput, None]]] = (
-            None
-        )
+        list_result_generator: list[AsyncGenerator[RequestOutput, None]] | None = None
         try:
             # Unlike most decoder-only models, whisper generation length is not
             # constrained by the size of the input audio, which is mapped to a
@@ -252,13 +250,10 @@ class OpenAISpeechToText(OpenAIServing):
         request_metadata: RequestResponseMetadata,
         audio_duration_s: float,
         chunk_object_type: Literal["translation.chunk", "transcription.chunk"],
-        response_stream_choice_class: Union[
-            type[TranscriptionResponseStreamChoice],
-            type[TranslationResponseStreamChoice],
-        ],
-        stream_response_class: Union[
-            type[TranscriptionStreamResponse], type[TranslationStreamResponse]
-        ],
+        response_stream_choice_class: type[TranscriptionResponseStreamChoice]
+        | type[TranslationResponseStreamChoice],
+        stream_response_class: type[TranscriptionStreamResponse]
+        | type[TranslationStreamResponse],
     ) -> AsyncGenerator[str, None]:
         created_time = int(time.time())
         model_name = request.model
diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
index e6ee2fa777f81..3327ac99134fb 100644
--- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py
@@ -2,9 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
-from collections.abc import Sequence
+from collections.abc import Callable, Sequence
 from functools import cached_property
-from typing import Callable, Optional, Union
 
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionRequest,
@@ -69,7 +68,7 @@ class ToolParser:
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """
         Instance method that should be implemented for extracting tool calls
         from an incomplete response; for use when handling tool calls and
@@ -101,7 +100,7 @@ class ToolParserManager:
     def _register_module(
         cls,
         module: type,
-        module_name: Optional[Union[str, list[str]]] = None,
+        module_name: str | list[str] | None = None,
         force: bool = True,
     ) -> None:
         if not issubclass(module, ToolParser):
@@ -123,10 +122,10 @@ class ToolParserManager:
     @classmethod
     def register_module(
         cls,
-        name: Optional[Union[str, list[str]]] = None,
+        name: str | list[str] | None = None,
         force: bool = True,
-        module: Union[type, None] = None,
-    ) -> Union[type, Callable]:
+        module: type | None = None,
+    ) -> type | Callable:
         """
         Register module with the given name or name list. it can be used as a
         decoder(with module as None) or normal function(with module as not
diff --git a/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py
index c6e8f1686e245..14fd5cf0941c6 100644
--- a/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/deepseekv31_tool_parser.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
-from typing import Union
 
 import regex as re
 
@@ -129,7 +128,7 @@ class DeepSeekV31ToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         logger.debug("delta_text: %s", delta_text)
         logger.debug("delta_token_ids: %s", delta_token_ids)
         # check to see if we should be streaming a tool call - is there a
@@ -272,7 +271,7 @@ class DeepSeekV31ToolParser(ToolParser):
             if not self.current_tool_name_sent:
                 if current_tool_call is None:
                     return None
-                function_name: Union[str, None] = current_tool_call.get("name")
+                function_name: str | None = current_tool_call.get("name")
                 if function_name:
                     self.current_tool_name_sent = True
                     return DeltaMessage(
diff --git a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
index e8a5d2e6dc133..b256560fb4beb 100644
--- a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
-from typing import Union
 
 import regex as re
 
@@ -129,7 +128,7 @@ class DeepSeekV3ToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         logger.debug("delta_text: %s", delta_text)
         logger.debug("delta_token_ids: %s", delta_token_ids)
         # check to see if we should be streaming a tool call - is there a
@@ -272,7 +271,7 @@ class DeepSeekV3ToolParser(ToolParser):
             if not self.current_tool_name_sent:
                 if current_tool_call is None:
                     return None
-                function_name: Union[str, None] = current_tool_call.get("name")
+                function_name: str | None = current_tool_call.get("name")
                 if function_name:
                     self.current_tool_name_sent = True
                     return DeltaMessage(
diff --git a/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py
index 1d7d7d3f8629d..5081b38240ce6 100644
--- a/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py
@@ -4,7 +4,7 @@
 import ast
 import json
 from collections.abc import Sequence
-from typing import Any, Optional, Union
+from typing import Any
 
 import regex as re
 
@@ -66,7 +66,7 @@ class Glm4MoeModelToolParser(ToolParser):
         def _is_string_type(
             tool_name: str,
             arg_name: str,
-            tools: Optional[list[ChatCompletionToolsParam]],
+            tools: list[ChatCompletionToolsParam] | None,
         ) -> bool:
             if tools is None:
                 return False
@@ -144,7 +144,7 @@ class Glm4MoeModelToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         self._buffer += delta_text
         cur_text = self._buffer
         start_idx = cur_text.find(self.tool_call_start_token)
diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
index c42b358b1e34b..c5246685f4071 100644
--- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
@@ -4,7 +4,6 @@
 import json
 from collections.abc import Sequence
 from json import JSONDecoder
-from typing import Union
 
 import partial_json_parser
 import regex as re
@@ -121,7 +120,7 @@ class Granite20bFCToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         if len(current_text) < len(self.bot_token) and self.bot_token.startswith(
             current_text
         ):
diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
index 989973923ae58..cc1f500342353 100644
--- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
@@ -3,7 +3,6 @@
 
 import json
 from collections.abc import Sequence
-from typing import Union
 
 import partial_json_parser
 from partial_json_parser.core.options import Allow
@@ -108,7 +107,7 @@ class GraniteToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         start_idx = consume_space(0, current_text)
         if current_text[start_idx:].startswith(self.bot_token):
             start_idx = consume_space(start_idx + len(self.bot_token), current_text)
diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
index 4529eb51796e1..ca3239e94377f 100644
--- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
@@ -3,7 +3,6 @@
 
 import json
 from collections.abc import Sequence
-from typing import Union
 
 import partial_json_parser
 import regex as re
@@ -181,7 +180,7 @@ class Hermes2ProToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         # 1. All tokens are parsed based on _text, not token_ids.
         # 2. All incoming text data is processed by the tool_call_delta_buffer
         #    function for buffering before being used for parsing.
@@ -333,7 +332,7 @@ class Hermes2ProToolParser(ToolParser):
             if not self.current_tool_name_sent:
                 if current_tool_call is None:
                     return None
-                function_name: Union[str, None] = current_tool_call.get("name")
+                function_name: str | None = current_tool_call.get("name")
                 if function_name:
                     self.current_tool_name_sent = True
                     return DeltaMessage(
diff --git a/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py
index 1855d69adb217..b32e6e39b3e5c 100644
--- a/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/hunyuan_a13b_tool_parser.py
@@ -4,7 +4,7 @@
 
 import json
 from collections.abc import Sequence
-from typing import Any, Optional, Union
+from typing import Any
 
 import regex as re
 
@@ -73,7 +73,7 @@ class HunyuanA13BToolParser(ToolParser):
 
     def preprocess_model_output(
         self, model_output: str
-    ) -> tuple[Optional[str], Optional[str]]:
+    ) -> tuple[str | None, str | None]:
         # find the location tool call
         for match in self.answer_tool_calls_pattern.finditer(model_output):
             start, end = match.span()
@@ -176,7 +176,7 @@ class HunyuanA13BToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """
         Extract tool calls for streaming mode.
         """
diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
index 9adaea297b05f..958aa3b98fafb 100644
--- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
@@ -3,7 +3,6 @@
 
 import json
 from collections.abc import Sequence
-from typing import Union
 
 import partial_json_parser
 from partial_json_parser.core.options import Allow
@@ -59,7 +58,7 @@ class Internlm2ToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         if "<|action_start|>" not in current_text:
             self.position = len(current_text)
             return DeltaMessage(content=delta_text)
diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
index 1ae3e0da33513..ca0faabada207 100644
--- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
@@ -3,7 +3,6 @@
 
 import json
 from collections.abc import Sequence
-from typing import Union
 
 import partial_json_parser
 import regex as re
@@ -129,7 +128,7 @@ class JambaToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         # if the tool call token is not in the tokens generated so far, append
         # output to contents since it's not a tool
         if self.tool_calls_start_token not in current_text:
@@ -190,7 +189,7 @@ class JambaToolParser(ToolParser):
                 # auto-generated due to JSON completions, but wasn't
                 # streamed to the client yet.
                 if self.current_tool_id >= 0:
-                    diff: Union[str, None] = current_tool_call.get("arguments")
+                    diff: str | None = current_tool_call.get("arguments")
 
                     if diff:
                         diff = json.dumps(diff, ensure_ascii=False).replace(
diff --git a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py
index a2eff21a44667..98a52ddd60d68 100644
--- a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py
@@ -3,7 +3,6 @@
 # code modified from deepseekv3_tool_parser.py
 
 from collections.abc import Sequence
-from typing import Union
 
 import regex as re
 
@@ -131,7 +130,7 @@ class KimiK2ToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         logger.debug("delta_text: %s", delta_text)
         logger.debug("delta_token_ids: %s", delta_token_ids)
         # check to see if we should be streaming a tool call - is there a
@@ -278,7 +277,7 @@ class KimiK2ToolParser(ToolParser):
             if not self.current_tool_name_sent:
                 if current_tool_call is None:
                     return None
-                function_name: Union[str, None] = current_tool_call.get("name")
+                function_name: str | None = current_tool_call.get("name")
                 tool_id = current_tool_call.get("id")
                 if function_name:
                     self.current_tool_name_sent = True
diff --git a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
index 162675efbc9a7..dd622b69525de 100644
--- a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
@@ -3,7 +3,7 @@
 import ast
 import json
 from collections.abc import Sequence
-from typing import Any, Union
+from typing import Any
 
 import regex as re
 from transformers import PreTrainedTokenizerBase
@@ -128,7 +128,7 @@ class Llama4PythonicToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         if not current_text.startswith("[") and not current_text.startswith(
             "<|python_start|>"
         ):
@@ -245,7 +245,7 @@ def _handle_single_tool(call: ast.Call) -> ToolCall:
     )
 
 
-def _make_valid_python(text: str) -> Union[tuple[str, str], None]:
+def _make_valid_python(text: str) -> tuple[str, str] | None:
     bracket_stack = []
     for index, char in enumerate(text):
         if char in {"[", "(", "{"}:
@@ -317,7 +317,7 @@ def _make_valid_python(text: str) -> Union[tuple[str, str], None]:
 
 def _compute_tool_delta(
     previously_sent_args: str, new_call: ToolCall, index: int, withheld_suffix: str
-) -> Union[DeltaToolCall, None]:
+) -> DeltaToolCall | None:
     new_call_args = new_call.function.arguments
     if withheld_suffix:
         assert new_call_args.endswith(withheld_suffix)
diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
index 4d5ef5ed64aa2..8c7b3cefb200e 100644
--- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
@@ -3,7 +3,6 @@
 
 import json
 from collections.abc import Sequence
-from typing import Union
 
 import partial_json_parser
 import regex as re
@@ -134,7 +133,7 @@ class Llama3JsonToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         if not (
             current_text.startswith(self.bot_token) or current_text.startswith("{")
         ):
diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
index 0b83fd237a6a7..4b12bf68b3670 100644
--- a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py
@@ -3,7 +3,7 @@
 
 import json
 from collections.abc import Sequence
-from typing import Any, Optional, Union
+from typing import Any
 
 import regex as re
 
@@ -509,7 +509,7 @@ class MinimaxToolParser(ToolParser):
 
     def _get_current_tool_content(
         self, text: str, tool_index: int
-    ) -> tuple[Optional[str], Optional[str]]:
+    ) -> tuple[str | None, str | None]:
         """
         Get the content of a specific tool by index.
 
@@ -545,7 +545,7 @@ class MinimaxToolParser(ToolParser):
 
     def _handle_tool_name_streaming(
         self, tool_content: str, tool_count: int
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """
         Handle streaming of tool names.
 
@@ -595,7 +595,7 @@ class MinimaxToolParser(ToolParser):
 
     def _handle_tool_args_streaming(
         self, tool_content: str, tool_count: int
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """
         Handle streaming of tool arguments.
 
@@ -702,7 +702,7 @@ class MinimaxToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         self._update_thinking_state(current_text)
 
         if self.in_thinking_tag:
@@ -776,7 +776,7 @@ class MinimaxToolParser(ToolParser):
             )
             return None
 
-    def _find_tool_start_outside_thinking(self, current_text: str) -> Optional[int]:
+    def _find_tool_start_outside_thinking(self, current_text: str) -> int | None:
         """
         Find the start position of tool calls outside of thinking tags.
 
@@ -809,7 +809,7 @@ class MinimaxToolParser(ToolParser):
 
     def _extract_content_before_tools(
         self, current_text: str, delta_text: str, tool_start: int
-    ) -> Optional[str]:
+    ) -> str | None:
         """
         Extract content that appears before tool calls.
 
diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
index b3b8960276bcc..12b3d7bea8a42 100644
--- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
@@ -5,7 +5,6 @@ import json
 from collections.abc import Sequence
 from random import choices
 from string import ascii_letters, digits
-from typing import Union
 
 import partial_json_parser
 import regex as re
@@ -194,7 +193,7 @@ class MistralToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         # if the tool call token is not in the tokens generated so far, append
         # output to contents since it's not a tool
         if self.bot_token not in current_text:
@@ -252,7 +251,7 @@ class MistralToolParser(ToolParser):
                 # auto-generated due to JSON completions, but wasn't
                 # streamed to the client yet.
                 if self.current_tool_id >= 0:
-                    diff: Union[str, None] = current_tool_call.get("arguments")
+                    diff: str | None = current_tool_call.get("arguments")
 
                     if diff:
                         diff = json.dumps(diff, ensure_ascii=False).replace(
diff --git a/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py
index 8d7cbbfba649d..f44876943ac28 100644
--- a/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import json
 from collections.abc import Sequence
 from typing import TYPE_CHECKING
@@ -22,13 +20,15 @@ from vllm.logger import init_logger
 
 if TYPE_CHECKING:
     from vllm.transformers_utils.tokenizer import AnyTokenizer
+else:
+    AnyTokenizer = object
 
 logger = init_logger(__name__)
 
 
 @ToolParserManager.register_module("openai")
 class OpenAIToolParser(ToolParser):
-    def __init__(self, tokenizer: AnyTokenizer):
+    def __init__(self, tokenizer: "AnyTokenizer"):
         super().__init__(tokenizer)
 
     def extract_tool_calls(
diff --git a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
index 114987e5600b2..a8387ba1494df 100644
--- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
@@ -3,7 +3,7 @@
 
 import json
 from collections.abc import Sequence
-from typing import Any, Optional
+from typing import Any
 
 import regex as re
 from transformers import PreTrainedTokenizerBase
@@ -118,5 +118,5 @@ class Phi4MiniJsonToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Optional[DeltaMessage]:
+    ) -> DeltaMessage | None:
         return None
diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
index 272068a6f0ac7..4945e7b5ab20a 100644
--- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
@@ -4,7 +4,7 @@
 import ast
 import json
 from collections.abc import Sequence
-from typing import Any, Union
+from typing import Any
 
 import regex as re
 from transformers import PreTrainedTokenizerBase
@@ -124,7 +124,7 @@ class PythonicToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         if not current_text.startswith("["):
             return DeltaMessage(content=delta_text)
 
@@ -236,7 +236,7 @@ def _handle_single_tool(call: ast.Call) -> ToolCall:
     )
 
 
-def _make_valid_python(text: str) -> Union[tuple[str, str], None]:
+def _make_valid_python(text: str) -> tuple[str, str] | None:
     bracket_stack = []
     for index, char in enumerate(text):
         if char in {"[", "(", "{"}:
@@ -308,7 +308,7 @@ def _make_valid_python(text: str) -> Union[tuple[str, str], None]:
 
 def _compute_tool_delta(
     previously_sent_args: str, new_call: ToolCall, index: int, withheld_suffix: str
-) -> Union[DeltaToolCall, None]:
+) -> DeltaToolCall | None:
     new_call_args = new_call.function.arguments
     if withheld_suffix:
         assert new_call_args.endswith(withheld_suffix)
diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py
index a41ca30bf5276..ad56972e6387e 100644
--- a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py
@@ -4,7 +4,7 @@ import ast
 import json
 import uuid
 from collections.abc import Sequence
-from typing import Any, Optional, Union
+from typing import Any
 
 import regex as re
 
@@ -36,7 +36,7 @@ class Qwen3CoderToolParser(ToolParser):
         self.current_tool_name_sent: bool = False
         self.prev_tool_call_arr: list[dict] = []
         # Override base class type - we use string IDs for tool calls
-        self.current_tool_id: Optional[str] = None  # type: ignore
+        self.current_tool_id: str | None = None  # type: ignore
         self.streamed_args_for_tool: list[str] = []
 
         # Sentinel tokens for streaming mode
@@ -110,7 +110,7 @@ class Qwen3CoderToolParser(ToolParser):
         self.streaming_request = None
 
     def _get_arguments_config(
-        self, func_name: str, tools: Optional[list[ChatCompletionToolsParam]]
+        self, func_name: str, tools: list[ChatCompletionToolsParam] | None
     ) -> dict:
         """Extract argument configuration for a function."""
         if tools is None:
@@ -240,8 +240,8 @@ class Qwen3CoderToolParser(ToolParser):
             return param_value
 
     def _parse_xml_function_call(
-        self, function_call_str: str, tools: Optional[list[ChatCompletionToolsParam]]
-    ) -> Optional[ToolCall]:
+        self, function_call_str: str, tools: list[ChatCompletionToolsParam] | None
+    ) -> ToolCall | None:
         # Extract function name
         end_index = function_call_str.index(">")
         function_name = function_call_str[:end_index]
@@ -349,7 +349,7 @@ class Qwen3CoderToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         # Store request for type conversion
         if not previous_text:
             self._reset_streaming_state()
diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py
index 1b7e4fec316eb..2c5b0b6a85f76 100644
--- a/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/qwen3xml_tool_parser.py
@@ -4,7 +4,7 @@ import ast
 import json
 import uuid
 from collections.abc import Sequence
-from typing import Any, Optional, Union
+from typing import Any
 from xml.parsers.expat import ParserCreate
 
 import regex as re
@@ -39,7 +39,7 @@ class StreamingXMLToolCallParser:
         self.reset_streaming_state()
 
         # Tool configuration information
-        self.tools: Union[list[ChatCompletionToolsParam], None] = None
+        self.tools: list[ChatCompletionToolsParam] | None = None
         self.tool_call_start_token: str = "<tool_call>"
         self.tool_call_end_token: str = "</tool_call>"
         self.function_start_token: str = "<function="
@@ -341,7 +341,7 @@ class StreamingXMLToolCallParser:
         # Skip blank content
         return not element
 
-    def _find_next_complete_element(self, start_pos: int) -> tuple[Optional[str], int]:
+    def _find_next_complete_element(self, start_pos: int) -> tuple[str | None, int]:
         """
         Find next complete XML element from specified position
 
@@ -584,7 +584,7 @@ class StreamingXMLToolCallParser:
         """Emit Delta response (streaming output)"""
         self.deltas.append(delta)
 
-    def _auto_close_open_parameter_if_needed(self, incoming_tag: Optional[str] = None):
+    def _auto_close_open_parameter_if_needed(self, incoming_tag: str | None = None):
         """Before starting to process new elements,
         if there are unclosed tags from before,
         automatically complete their endings to the parser.
@@ -953,7 +953,7 @@ class StreamingXMLToolCallParser:
         self.parser.EndElementHandler = self._end_element
         self.parser.CharacterDataHandler = self._char_data
 
-    def set_tools(self, tools: Union[list[ChatCompletionToolsParam], None]):
+    def set_tools(self, tools: list[ChatCompletionToolsParam] | None):
         """Set tool configuration information"""
         self.tools = tools
 
@@ -961,7 +961,7 @@ class StreamingXMLToolCallParser:
         """Generate unique call ID"""
         return f"call_{uuid.uuid4().hex[:24]}"
 
-    def _extract_function_name(self, name: str, attrs: dict[str, str]) -> Optional[str]:
+    def _extract_function_name(self, name: str, attrs: dict[str, str]) -> str | None:
         """Extract function name from various formats"""
         if attrs and "name" in attrs:
             return attrs["name"]
@@ -973,9 +973,7 @@ class StreamingXMLToolCallParser:
 
         return None
 
-    def _extract_parameter_name(
-        self, name: str, attrs: dict[str, str]
-    ) -> Optional[str]:
+    def _extract_parameter_name(self, name: str, attrs: dict[str, str]) -> str | None:
         """Extract parameter name from various formats"""
         if attrs and "name" in attrs:
             return attrs["name"]
@@ -1218,7 +1216,7 @@ class Qwen3XMLToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         if not previous_text:
             self.parser.reset_streaming_state()
             if request:
diff --git a/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py
index 2e7bd0d1d344d..f50a2df53bc04 100644
--- a/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/seed_oss_tool_parser.py
@@ -7,7 +7,7 @@ import ast
 import json
 import uuid
 from collections.abc import Sequence
-from typing import Any, Optional, Union
+from typing import Any
 
 import regex as re
 
@@ -109,8 +109,8 @@ class SeedOssToolParser(ToolParser):
         self.json_closed = False
 
     def _parse_xml_function_call(
-        self, function_call_str: str, tools: Optional[list[ChatCompletionToolsParam]]
-    ) -> Optional[ToolCall]:
+        self, function_call_str: str, tools: list[ChatCompletionToolsParam] | None
+    ) -> ToolCall | None:
         def get_arguments_config(func_name: str) -> dict:
             if tools is None:
                 return {}
@@ -357,7 +357,7 @@ class SeedOssToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         # If no delta text, return None unless
         # it's an EOS token after tool calls
         if not delta_text:
diff --git a/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py
index 34bd372b2060b..0a80c5ccc354d 100644
--- a/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/step3_tool_parser.py
@@ -4,7 +4,7 @@
 import contextlib
 import json
 from collections.abc import Sequence
-from typing import Any, Optional, Union
+from typing import Any
 
 import regex as re
 
@@ -58,7 +58,7 @@ class Step3ToolParser(ToolParser):
     @staticmethod
     def _parse_steptml_invoke(
         action_text: str,
-    ) -> tuple[Optional[str], Optional[dict[str, str]]]:
+    ) -> tuple[str | None, dict[str, str] | None]:
         func_name_match = re.search(r'<steptml:invoke name="([^"]+)">', action_text)
         if not func_name_match:
             return None, None
@@ -117,7 +117,7 @@ class Step3ToolParser(ToolParser):
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         # The main loop processes the stream from the last known position.
         while True:
             if self.position >= len(current_text):
diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py
index 98c9cbbbd376e..4f1213b097306 100644
--- a/vllm/entrypoints/renderer.py
+++ b/vllm/entrypoints/renderer.py
@@ -5,7 +5,7 @@ import asyncio
 import io
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Annotated, Optional, Union
+from typing import Annotated
 
 import pybase64
 import torch
@@ -24,25 +24,25 @@ from vllm.utils import AsyncMicrobatchTokenizer
 class RenderConfig:
     """Configuration to control how prompts are prepared."""
 
-    max_length: Optional[int] = None
+    max_length: int | None = None
     """Maximum allowable total input token length. If provided,
     token inputs longer than this raise ``ValueError``."""
 
-    truncate_prompt_tokens: Optional[int] = None
+    truncate_prompt_tokens: int | None = None
     """Number of tokens to keep. ``None`` means no truncation.
     ``0`` yields an empty list (and skips embeds).
     ``-1`` maps to ``model_config.max_model_len``."""
 
-    add_special_tokens: Optional[bool] = True
+    add_special_tokens: bool | None = True
     """Whether to add model-specific special tokens during tokenization."""
 
-    cache_salt: Optional[str] = None
+    cache_salt: str | None = None
     """String to disambiguate prefix cache entries."""
 
-    needs_detokenization: Optional[bool] = False
+    needs_detokenization: bool | None = False
     """If True, detokenize IDs back to text for inclusion in outputs."""
 
-    def verify_truncate_prompt_tokens(self, model_config: ModelConfig) -> Optional[int]:
+    def verify_truncate_prompt_tokens(self, model_config: ModelConfig) -> int | None:
         """Validate and normalize `truncate_prompt_tokens` parameter."""
         truncate_prompt_tokens = self.truncate_prompt_tokens
         if truncate_prompt_tokens is None:
@@ -85,7 +85,7 @@ class BaseRenderer(ABC):
     def __init__(
         self,
         model_config: ModelConfig,
-        tokenizer: Optional[AnyTokenizer] = None,
+        tokenizer: AnyTokenizer | None = None,
     ):
         super().__init__()
         self.model_config = model_config
@@ -95,7 +95,7 @@ class BaseRenderer(ABC):
     async def render_prompt(
         self,
         *,
-        prompt_or_prompts: Union[str, list[str], list[int], list[list[int]]],
+        prompt_or_prompts: str | list[str] | list[int] | list[list[int]],
         config: RenderConfig,
     ) -> list[EngineTokensPrompt]:
         """
@@ -126,12 +126,10 @@ class BaseRenderer(ABC):
     async def render_prompt_and_embeds(
         self,
         *,
-        prompt_or_prompts: Optional[
-            Union[str, list[str], list[int], list[list[int]]]
-        ] = None,
-        prompt_embeds: Optional[Union[bytes, list[bytes]]] = None,
+        prompt_or_prompts: str | list[str] | list[int] | list[list[int]] | None = None,
+        prompt_embeds: bytes | list[bytes] | None = None,
         config: RenderConfig,
-    ) -> list[Union[EngineTokensPrompt, EngineEmbedsPrompt]]:
+    ) -> list[EngineTokensPrompt | EngineEmbedsPrompt]:
         """
         Convert text/token and/or base64-encoded embeddings inputs into
         engine-ready prompt objects using a unified RenderConfig.
@@ -161,9 +159,9 @@ class BaseRenderer(ABC):
     @classmethod
     def load_prompt_embeds(
         cls,
-        prompt_embeds: Union[bytes, list[bytes]],
-        truncate_prompt_tokens: Optional[Annotated[int, Field(ge=0)]] = None,
-        cache_salt: Optional[str] = None,
+        prompt_embeds: bytes | list[bytes],
+        truncate_prompt_tokens: Annotated[int, Field(ge=0)] | None = None,
+        cache_salt: str | None = None,
     ) -> list[EngineEmbedsPrompt]:
         """Load and validate base64-encoded embeddings into prompt objects."""
 
@@ -199,19 +197,18 @@ class CompletionRenderer(BaseRenderer):
     def __init__(
         self,
         model_config: ModelConfig,
-        tokenizer: Optional[AnyTokenizer] = None,
-        async_tokenizer_pool: Optional[
-            dict[AnyTokenizer, AsyncMicrobatchTokenizer]
-        ] = None,
+        tokenizer: AnyTokenizer | None = None,
+        async_tokenizer_pool: dict[AnyTokenizer, AsyncMicrobatchTokenizer]
+        | None = None,
     ):
         super().__init__(model_config, tokenizer)
         self.async_tokenizer_pool = async_tokenizer_pool
-        self.async_tokenizer: Optional[AsyncMicrobatchTokenizer] = None
+        self.async_tokenizer: AsyncMicrobatchTokenizer | None = None
 
     async def render_prompt(
         self,
         *,
-        prompt_or_prompts: Union[str, list[str], list[int], list[list[int]]],
+        prompt_or_prompts: str | list[str] | list[int] | list[list[int]],
         config: RenderConfig,
     ) -> list[EngineTokensPrompt]:
         """Implementation of prompt rendering for completion-style requests.
@@ -237,12 +234,10 @@ class CompletionRenderer(BaseRenderer):
     async def render_prompt_and_embeds(
         self,
         *,
-        prompt_or_prompts: Optional[
-            Union[str, list[str], list[int], list[list[int]]]
-        ] = None,
-        prompt_embeds: Optional[Union[bytes, list[bytes]]] = None,
+        prompt_or_prompts: str | list[str] | list[int] | list[list[int]] | None = None,
+        prompt_embeds: bytes | list[bytes] | None = None,
         config: RenderConfig,
-    ) -> list[Union[EngineTokensPrompt, EngineEmbedsPrompt]]:
+    ) -> list[EngineTokensPrompt | EngineEmbedsPrompt]:
         """
         Render text/token prompts and/or precomputed embedding prompts. At
         least one of `prompt_or_prompts` or `prompt_embeds` must be provided.
@@ -251,7 +246,7 @@ class CompletionRenderer(BaseRenderer):
         if truncate_prompt_tokens == 0:
             return []
 
-        rendered: list[Union[EngineTokensPrompt, EngineEmbedsPrompt]] = []
+        rendered: list[EngineTokensPrompt | EngineEmbedsPrompt] = []
 
         if prompt_embeds is not None:
             rendered.extend(
@@ -271,7 +266,7 @@ class CompletionRenderer(BaseRenderer):
         return rendered
 
     def _maybe_apply_truncation(
-        self, token_ids: list[int], truncate_prompt_tokens: Optional[int]
+        self, token_ids: list[int], truncate_prompt_tokens: int | None
     ) -> list[int]:
         """Apply truncation to token sequence."""
         if truncate_prompt_tokens is None:
@@ -283,9 +278,9 @@ class CompletionRenderer(BaseRenderer):
 
     async def _create_prompt(
         self,
-        prompt_input: Union[EngineTextPrompt, EngineTokensPrompt],
+        prompt_input: EngineTextPrompt | EngineTokensPrompt,
         config: RenderConfig,
-        truncate_prompt_tokens: Optional[int],
+        truncate_prompt_tokens: int | None,
     ) -> EngineTokensPrompt:
         prompt, prompt_token_ids, _ = get_prompt_components(prompt_input)
 
@@ -315,10 +310,10 @@ class CompletionRenderer(BaseRenderer):
     async def _create_prompt_from_text(
         self,
         text: str,
-        max_length: Optional[int],
-        truncate_prompt_tokens: Optional[int],
-        add_special_tokens: Optional[bool],
-        cache_salt: Optional[str],
+        max_length: int | None,
+        truncate_prompt_tokens: int | None,
+        add_special_tokens: bool | None,
+        cache_salt: str | None,
     ) -> EngineTokensPrompt:
         """Tokenize text input asynchronously."""
         async_tokenizer = self._get_async_tokenizer()
@@ -348,10 +343,10 @@ class CompletionRenderer(BaseRenderer):
     async def _create_prompt_from_token_ids(
         self,
         token_ids: list[int],
-        max_length: Optional[int],
-        truncate_prompt_tokens: Optional[int],
-        cache_salt: Optional[str],
-        needs_detokenization: Optional[bool] = False,
+        max_length: int | None,
+        truncate_prompt_tokens: int | None,
+        cache_salt: str | None,
+        needs_detokenization: bool | None = False,
     ) -> EngineTokensPrompt:
         """Optionally detokenize token IDs and build a tokens prompt."""
         token_ids = self._maybe_apply_truncation(token_ids, truncate_prompt_tokens)
@@ -391,9 +386,9 @@ class CompletionRenderer(BaseRenderer):
     def _create_tokens_prompt(
         self,
         token_ids: list[int],
-        max_length: Optional[int] = None,
-        cache_salt: Optional[str] = None,
-        prompt: Optional[str] = None,
+        max_length: int | None = None,
+        cache_salt: str | None = None,
+        prompt: str | None = None,
     ) -> EngineTokensPrompt:
         """Create validated EngineTokensPrompt."""
         if max_length is not None and len(token_ids) > max_length:
diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py
index 1fb56d246debe..cd62cfe5448c4 100644
--- a/vllm/entrypoints/score_utils.py
+++ b/vllm/entrypoints/score_utils.py
@@ -1,9 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any, Optional, Union, cast
+from typing import Any, TypeAlias, cast
 
 from torch.nn import CosineSimilarity
-from typing_extensions import Required, TypeAlias, TypedDict
+from typing_extensions import Required, TypedDict
 
 from vllm.config import ModelConfig
 from vllm.entrypoints.chat_utils import (
@@ -25,9 +25,9 @@ from vllm.transformers_utils.tokenizer import (
     PreTrainedTokenizerFast,
 )
 
-ScoreContentPartParam: TypeAlias = Union[
-    ChatCompletionContentPartImageParam, ChatCompletionContentPartImageEmbedsParam
-]
+ScoreContentPartParam: TypeAlias = (
+    ChatCompletionContentPartImageParam | ChatCompletionContentPartImageEmbedsParam
+)
 
 
 class ScoreMultiModalParam(TypedDict, total=False):
@@ -45,12 +45,12 @@ class ScoreMultiModalParam(TypedDict, total=False):
 
 
 def _cosine_similarity(
-    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+    tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
     embed_1: list[PoolingRequestOutput],
     embed_2: list[PoolingRequestOutput],
 ) -> list[PoolingRequestOutput]:
     scorer = CosineSimilarity(0)
-    scores: Union[list[PoolingRequestOutput]] = []
+    scores: list[PoolingRequestOutput] = []
 
     for emb_1, emb_2 in zip(embed_1, embed_2):
         pair_score = scorer(emb_1.outputs.data, emb_2.outputs.data)
@@ -74,8 +74,8 @@ def _cosine_similarity(
 
 
 def _validate_score_input_lens(
-    data_1: Union[list[str], list[ScoreContentPartParam]],
-    data_2: Union[list[str], list[ScoreContentPartParam]],
+    data_1: list[str] | list[ScoreContentPartParam],
+    data_2: list[str] | list[ScoreContentPartParam],
 ):
     len_1 = len(data_1)
     len_2 = len(data_2)
@@ -89,18 +89,18 @@ def _validate_score_input_lens(
 
 
 def parse_score_data(
-    data_1: Union[str, ScoreContentPartParam],
-    data_2: Union[str, ScoreContentPartParam],
+    data_1: str | ScoreContentPartParam,
+    data_2: str | ScoreContentPartParam,
     model_config: ModelConfig,
     tokenizer: AnyTokenizer,
-) -> tuple[str, str, Optional[MultiModalDataDict]]:
+) -> tuple[str, str, MultiModalDataDict | None]:
     mm_tracker = MultiModalItemTracker(model_config, tokenizer)
 
     content_1 = _parse_score_content(data_1, mm_tracker)
 
     content_2 = _parse_score_content(data_2, mm_tracker)
 
-    def ensure_str(content: Optional[_ContentPart]) -> str:
+    def ensure_str(content: _ContentPart | None) -> str:
         if content is not None and isinstance(content, str):
             return cast(str, content)
         else:
@@ -113,9 +113,9 @@ def parse_score_data(
 
 
 def _parse_score_content(
-    data: Union[str, ScoreContentPartParam],
+    data: str | ScoreContentPartParam,
     mm_tracker: BaseMultiModalItemTracker,
-) -> Optional[_ContentPart]:
+) -> _ContentPart | None:
     if isinstance(data, str):
         data = ChatCompletionContentPartTextParam(type="text", text=data)
 
@@ -182,8 +182,8 @@ def get_score_prompt(
     model_config: ModelConfig,
     tokenizer: AnyTokenizer,
     tokenization_kwargs: dict[str, Any],
-    data_1: Union[str, ScoreContentPartParam],
-    data_2: Union[str, ScoreContentPartParam],
+    data_1: str | ScoreContentPartParam,
+    data_2: str | ScoreContentPartParam,
 ) -> tuple[str, TokensPrompt]:
     prompt_1, prompt_2, mm_data = parse_score_data(
         data_1,
diff --git a/vllm/entrypoints/ssl.py b/vllm/entrypoints/ssl.py
index ff0dd1bbfc6bb..4d947bc620cf1 100644
--- a/vllm/entrypoints/ssl.py
+++ b/vllm/entrypoints/ssl.py
@@ -2,8 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
+from collections.abc import Callable
 from ssl import SSLContext
-from typing import Callable, Optional
 
 from watchfiles import Change, awatch
 
@@ -20,9 +20,9 @@ class SSLCertRefresher:
     def __init__(
         self,
         ssl_context: SSLContext,
-        key_path: Optional[str] = None,
-        cert_path: Optional[str] = None,
-        ca_path: Optional[str] = None,
+        key_path: str | None = None,
+        cert_path: str | None = None,
+        ca_path: str | None = None,
     ) -> None:
         self.ssl = ssl_context
         self.key_path = key_path
diff --git a/vllm/entrypoints/tool_server.py b/vllm/entrypoints/tool_server.py
index b3dceecc15834..0d83031ef69fe 100644
--- a/vllm/entrypoints/tool_server.py
+++ b/vllm/entrypoints/tool_server.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from contextlib import AbstractAsyncContextManager, asynccontextmanager
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 from openai_harmony import ToolDescription, ToolNamespaceConfig
 
@@ -80,7 +80,7 @@ class ToolServer(ABC):
         pass
 
     @abstractmethod
-    def get_tool_description(self, tool_name: str) -> Optional[ToolNamespaceConfig]:
+    def get_tool_description(self, tool_name: str) -> ToolNamespaceConfig | None:
         """
         Return the tool description for the given tool name.
         If the tool is not supported, return None.
@@ -89,7 +89,7 @@ class ToolServer(ABC):
 
     @abstractmethod
     def new_session(
-        self, tool_name: str, session_id: str, headers: Optional[dict[str, str]] = None
+        self, tool_name: str, session_id: str, headers: dict[str, str] | None = None
     ) -> AbstractAsyncContextManager[Any]:
         """
         Create a session for the tool.
@@ -152,7 +152,7 @@ class MCPToolServer(ToolServer):
 
     @asynccontextmanager
     async def new_session(
-        self, tool_name: str, session_id: str, headers: Optional[dict[str, str]] = None
+        self, tool_name: str, session_id: str, headers: dict[str, str] | None = None
     ):
         from mcp import ClientSession
         from mcp.client.sse import sse_client
@@ -190,7 +190,7 @@ class DemoToolServer(ToolServer):
     def has_tool(self, tool_name: str) -> bool:
         return tool_name in self.tools
 
-    def get_tool_description(self, tool_name: str) -> Optional[ToolNamespaceConfig]:
+    def get_tool_description(self, tool_name: str) -> ToolNamespaceConfig | None:
         if tool_name not in self.tools:
             return None
         if tool_name == "browser":
@@ -202,7 +202,7 @@ class DemoToolServer(ToolServer):
 
     @asynccontextmanager
     async def new_session(
-        self, tool_name: str, session_id: str, headers: Optional[dict[str, str]] = None
+        self, tool_name: str, session_id: str, headers: dict[str, str] | None = None
     ):
         if tool_name not in self.tools:
             raise KeyError(f"Tool '{tool_name}' is not supported")
diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py
index c97ca6538814d..1504705cf0e2b 100644
--- a/vllm/entrypoints/utils.py
+++ b/vllm/entrypoints/utils.py
@@ -6,7 +6,7 @@ import dataclasses
 import functools
 import os
 from argparse import Namespace
-from typing import Any, Optional, Union
+from typing import Any
 
 from fastapi import Request
 from fastapi.responses import JSONResponse, StreamingResponse
@@ -164,9 +164,9 @@ def cli_env_setup():
 
 def _validate_truncation_size(
     max_model_len: int,
-    truncate_prompt_tokens: Optional[int],
-    tokenization_kwargs: Optional[dict[str, Any]] = None,
-) -> Optional[int]:
+    truncate_prompt_tokens: int | None,
+    tokenization_kwargs: dict[str, Any] | None = None,
+) -> int | None:
     if truncate_prompt_tokens is not None:
         if truncate_prompt_tokens <= -1:
             truncate_prompt_tokens = max_model_len
@@ -191,7 +191,7 @@ def _validate_truncation_size(
 
 def get_max_tokens(
     max_model_len: int,
-    request: Union[ChatCompletionRequest, CompletionRequest],
+    request: ChatCompletionRequest | CompletionRequest,
     input_length: int,
     default_sampling_params: dict,
 ) -> int:
@@ -211,7 +211,7 @@ def get_max_tokens(
     )
 
 
-def log_non_default_args(args: Union[Namespace, EngineArgs]):
+def log_non_default_args(args: Namespace | EngineArgs):
     non_default_args = {}
 
     # Handle Namespace
diff --git a/vllm/envs.py b/vllm/envs.py
index ab8548cf50661..97076bec11b81 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -6,27 +6,28 @@ import json
 import os
 import sys
 import tempfile
-from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union
+from collections.abc import Callable
+from typing import TYPE_CHECKING, Any, Literal
 
 if TYPE_CHECKING:
     VLLM_HOST_IP: str = ""
-    VLLM_PORT: Optional[int] = None
+    VLLM_PORT: int | None = None
     VLLM_RPC_BASE_PATH: str = tempfile.gettempdir()
     VLLM_USE_MODELSCOPE: bool = False
     VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60
-    VLLM_NCCL_SO_PATH: Optional[str] = None
-    LD_LIBRARY_PATH: Optional[str] = None
+    VLLM_NCCL_SO_PATH: str | None = None
+    LD_LIBRARY_PATH: str | None = None
     VLLM_USE_TRITON_FLASH_ATTN: bool = True
     VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False
-    VLLM_FLASH_ATTN_VERSION: Optional[int] = None
+    VLLM_FLASH_ATTN_VERSION: int | None = None
     LOCAL_RANK: int = 0
-    CUDA_VISIBLE_DEVICES: Optional[str] = None
+    CUDA_VISIBLE_DEVICES: str | None = None
     VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60
-    VLLM_API_KEY: Optional[str] = None
-    S3_ACCESS_KEY_ID: Optional[str] = None
-    S3_SECRET_ACCESS_KEY: Optional[str] = None
-    S3_ENDPOINT_URL: Optional[str] = None
-    VLLM_MODEL_REDIRECT_PATH: Optional[str] = None
+    VLLM_API_KEY: str | None = None
+    S3_ACCESS_KEY_ID: str | None = None
+    S3_SECRET_ACCESS_KEY: str | None = None
+    S3_ENDPOINT_URL: str | None = None
+    VLLM_MODEL_REDIRECT_PATH: str | None = None
     VLLM_CACHE_ROOT: str = os.path.expanduser("~/.cache/vllm")
     VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm")
     VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
@@ -38,16 +39,16 @@ if TYPE_CHECKING:
     VLLM_LOGGING_LEVEL: str = "INFO"
     VLLM_LOGGING_PREFIX: str = ""
     VLLM_LOGGING_STREAM: str = "ext://sys.stdout"
-    VLLM_LOGGING_CONFIG_PATH: Optional[str] = None
-    VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None
+    VLLM_LOGGING_CONFIG_PATH: str | None = None
+    VLLM_LOGITS_PROCESSOR_THREADS: int | None = None
     VLLM_LOG_STATS_INTERVAL: float = 10.0
     VLLM_TRACE_FUNCTION: int = 0
-    VLLM_ATTENTION_BACKEND: Optional[str] = None
-    VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None
-    VLLM_PP_LAYER_PARTITION: Optional[str] = None
-    VLLM_CPU_KVCACHE_SPACE: Optional[int] = 0
+    VLLM_ATTENTION_BACKEND: str | None = None
+    VLLM_USE_FLASHINFER_SAMPLER: bool | None = None
+    VLLM_PP_LAYER_PARTITION: str | None = None
+    VLLM_CPU_KVCACHE_SPACE: int | None = 0
     VLLM_CPU_OMP_THREADS_BIND: str = ""
-    VLLM_CPU_NUM_OF_RESERVED_CPU: Optional[int] = None
+    VLLM_CPU_NUM_OF_RESERVED_CPU: int | None = None
     VLLM_CPU_MOE_PREPACK: bool = True
     VLLM_CPU_SGL_KERNEL: bool = False
     VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
@@ -73,20 +74,20 @@ if TYPE_CHECKING:
     VLLM_MM_INPUT_CACHE_GIB: int = 4
     VLLM_TARGET_DEVICE: str = "cuda"
     VLLM_MAIN_CUDA_VERSION: str = "12.8"
-    MAX_JOBS: Optional[str] = None
-    NVCC_THREADS: Optional[str] = None
+    MAX_JOBS: str | None = None
+    NVCC_THREADS: str | None = None
     VLLM_USE_PRECOMPILED: bool = False
     VLLM_DOCKER_BUILD_CONTEXT: bool = False
     VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
     VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
-    CMAKE_BUILD_TYPE: Optional[Literal["Debug", "Release", "RelWithDebInfo"]] = None
+    CMAKE_BUILD_TYPE: Literal["Debug", "Release", "RelWithDebInfo"] | None = None
     VERBOSE: bool = False
     VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
     VLLM_RPC_TIMEOUT: int = 10000  # ms
     VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5  # seconds
-    VLLM_PLUGINS: Optional[list[str]] = None
-    VLLM_LORA_RESOLVER_CACHE_DIR: Optional[str] = None
-    VLLM_TORCH_PROFILER_DIR: Optional[str] = None
+    VLLM_PLUGINS: list[str] | None = None
+    VLLM_LORA_RESOLVER_CACHE_DIR: str | None = None
+    VLLM_TORCH_PROFILER_DIR: str | None = None
     VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False
     VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False
     VLLM_USE_AOT_COMPILE: bool = False
@@ -126,7 +127,7 @@ if TYPE_CHECKING:
     VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH: int = 32
     VLLM_RAY_PER_WORKER_GPUS: float = 1.0
     VLLM_RAY_BUNDLE_INDICES: str = ""
-    VLLM_CUDART_SO_PATH: Optional[str] = None
+    VLLM_CUDART_SO_PATH: str | None = None
     VLLM_DP_RANK: int = 0
     VLLM_DP_RANK_LOCAL: int = -1
     VLLM_DP_SIZE: int = 1
@@ -137,11 +138,11 @@ if TYPE_CHECKING:
     VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False
     VLLM_RAY_DP_PACK_STRATEGY: str = "strict"
     VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
-    VLLM_MXFP4_USE_MARLIN: Optional[bool] = None
+    VLLM_MXFP4_USE_MARLIN: bool | None = None
     VLLM_V0_USE_OUTLINES_CACHE: bool = False
     VLLM_V1_USE_OUTLINES_CACHE: bool = False
     VLLM_TPU_BUCKET_PADDING_GAP: int = 0
-    VLLM_TPU_MOST_MODEL_LEN: Optional[int] = None
+    VLLM_TPU_MOST_MODEL_LEN: int | None = None
     VLLM_TPU_USING_PATHWAYS: bool = False
     VLLM_USE_DEEP_GEMM: bool = True
     VLLM_USE_DEEP_GEMM_E8M0: bool = True
@@ -169,21 +170,21 @@ if TYPE_CHECKING:
     VLLM_SLEEP_WHEN_IDLE: bool = False
     VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16
     VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: int = 300
-    VLLM_KV_CACHE_LAYOUT: Optional[Literal["NHD", "HND"]] = None
+    VLLM_KV_CACHE_LAYOUT: Literal["NHD", "HND"] | None = None
     VLLM_COMPUTE_NANS_IN_LOGITS: bool = False
     VLLM_USE_NVFP4_CT_EMULATIONS: bool = False
     VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: Literal[
         "FP", "INT8", "INT6", "INT4", "NONE"
     ] = "NONE"
     VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
-    VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None
+    VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: int | None = None
     VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 480
     VLLM_USE_CUDNN_PREFILL: bool = False
     VLLM_ENABLE_CUDAGRAPH_GC: bool = False
     VLLM_LOOPBACK_IP: str = ""
     VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
     VLLM_ENABLE_RESPONSES_API_STORE: bool = False
-    VLLM_USE_TRTLLM_ATTENTION: Optional[str] = None
+    VLLM_USE_TRTLLM_ATTENTION: str | None = None
     VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION: bool = False
     VLLM_HAS_FLASHINFER_CUBIN: bool = False
     VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False
@@ -191,7 +192,7 @@ if TYPE_CHECKING:
     VLLM_ROCM_FP8_MFMA_PAGE_ATTN: bool = False
     VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS: bool = False
     VLLM_ALLREDUCE_USE_SYMM_MEM: bool = True
-    VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None
+    VLLM_TUNED_CONFIG_FOLDER: str | None = None
     VLLM_DISABLE_PAD_FOR_CUDAGRAPH: bool = False
     VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False
     VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False
@@ -201,12 +202,12 @@ if TYPE_CHECKING:
     VLLM_DEEPEP_BUFFER_SIZE_MB: int = 1024
     VLLM_DBO_COMM_SMS: int = 20
     GPT_OSS_SYSTEM_TOOL_MCP_LABELS: list[str] = []
-    VLLM_PATTERN_MATCH_DEBUG: Optional[str] = None
-    VLLM_DEBUG_DUMP_PATH: Optional[str] = None
+    VLLM_PATTERN_MATCH_DEBUG: str | None = None
+    VLLM_DEBUG_DUMP_PATH: str | None = None
     VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE: bool = True
     VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING: bool = True
     VLLM_USE_NCCL_SYMM_MEM: bool = False
-    VLLM_NCCL_INCLUDE_PATH: Optional[str] = None
+    VLLM_NCCL_INCLUDE_PATH: str | None = None
     VLLM_USE_FBGEMM: bool = False
     VLLM_GC_DEBUG: str = ""
 
@@ -225,13 +226,13 @@ def get_default_config_root():
     )
 
 
-def maybe_convert_int(value: Optional[str]) -> Optional[int]:
+def maybe_convert_int(value: str | None) -> int | None:
     if value is None:
         return None
     return int(value)
 
 
-def maybe_convert_bool(value: Optional[str]) -> Optional[bool]:
+def maybe_convert_bool(value: str | None) -> bool | None:
     if value is None:
         return None
     return bool(int(value))
@@ -246,10 +247,10 @@ def use_aot_compile() -> bool:
 
 def env_with_choices(
     env_name: str,
-    default: Optional[str],
-    choices: Union[list[str], Callable[[], list[str]]],
+    default: str | None,
+    choices: list[str] | Callable[[], list[str]],
     case_sensitive: bool = True,
-) -> Callable[[], Optional[str]]:
+) -> Callable[[], str | None]:
     """
     Create a lambda that validates environment variable against allowed choices
 
@@ -263,7 +264,7 @@ def env_with_choices(
         Lambda function for environment_variables dict
     """
 
-    def _get_validated_env() -> Optional[str]:
+    def _get_validated_env() -> str | None:
         value = os.getenv(env_name)
         if value is None:
             return default
@@ -292,7 +293,7 @@ def env_with_choices(
 def env_list_with_choices(
     env_name: str,
     default: list[str],
-    choices: Union[list[str], Callable[[], list[str]]],
+    choices: list[str] | Callable[[], list[str]],
     case_sensitive: bool = True,
 ) -> Callable[[], list[str]]:
     """
@@ -344,7 +345,7 @@ def env_list_with_choices(
     return _get_validated_env_list
 
 
-def get_vllm_port() -> Optional[int]:
+def get_vllm_port() -> int | None:
     """Get the port from VLLM_PORT environment variable.
 
     Returns:
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 7bdef5cbe748c..2c44422ba2178 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -4,9 +4,9 @@
 import asyncio
 import time
 from abc import ABC, abstractmethod
-from collections.abc import Awaitable
+from collections.abc import Awaitable, Callable
 from functools import cached_property
-from typing import Any, Callable, Optional, Union
+from typing import Any
 
 from typing_extensions import TypeVar
 
@@ -63,10 +63,10 @@ class ExecutorBase(ABC):
     @abstractmethod
     def collective_rpc(
         self,
-        method: Union[str, Callable[[WorkerBase], _R]],
-        timeout: Optional[float] = None,
+        method: str | Callable[[WorkerBase], _R],
+        timeout: float | None = None,
         args: tuple = (),
-        kwargs: Optional[dict[str, Any]] = None,
+        kwargs: dict[str, Any] | None = None,
     ) -> list[_R]:
         """
         Execute an RPC call on all workers.
@@ -143,7 +143,7 @@ class ExecutorBase(ABC):
 
     def execute_model(
         self, execute_model_req: ExecuteModelRequest
-    ) -> Optional[list[Union[SamplerOutput, PoolerOutput]]]:
+    ) -> list[SamplerOutput | PoolerOutput] | None:
         output = self.collective_rpc("execute_model", args=(execute_model_req,))
         return output[0]
 
@@ -192,7 +192,7 @@ class ExecutorBase(ABC):
             "It took %.6f seconds to fall asleep.", time_after_sleep - time_before_sleep
         )
 
-    def wake_up(self, tags: Optional[list[str]] = None):
+    def wake_up(self, tags: list[str] | None = None):
         if not self.is_sleeping:
             logger.warning("Executor is not sleeping.")
             return
@@ -222,8 +222,8 @@ class ExecutorBase(ABC):
     def save_sharded_state(
         self,
         path: str,
-        pattern: Optional[str] = None,
-        max_size: Optional[int] = None,
+        pattern: str | None = None,
+        max_size: int | None = None,
     ) -> None:
         self.collective_rpc(
             "save_sharded_state",
@@ -256,7 +256,7 @@ class ExecutorBase(ABC):
         exception."""
         self.check_health()
 
-    def init_kv_output_aggregator(self, finished_count: Optional[int]) -> None:
+    def init_kv_output_aggregator(self, finished_count: int | None) -> None:
         """Init KVOutputAggregator"""
         self.kv_output_aggregator = KVOutputAggregator(
             finished_count or self.parallel_config.world_size
@@ -269,7 +269,7 @@ class DistributedExecutorBase(ExecutorBase):
     def __init__(self, *args, **kwargs):
         # This is non-None when the execute model loop is running
         # in the parallel workers. It's a coroutine in the AsyncLLMEngine case.
-        self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None
+        self.parallel_worker_tasks: Any | Awaitable[Any] | None = None
 
         super().__init__(*args, **kwargs)
 
@@ -302,8 +302,8 @@ class DistributedExecutorBase(ExecutorBase):
 
     @abstractmethod
     def _driver_execute_model(
-        self, execute_model_req: Optional[ExecuteModelRequest]
-    ) -> Optional[list[SamplerOutput]]:
+        self, execute_model_req: ExecuteModelRequest | None
+    ) -> list[SamplerOutput] | None:
         """Run execute_model in the driver worker.
 
         Passing None will cause the driver to stop the model execution loop
@@ -314,20 +314,20 @@ class DistributedExecutorBase(ExecutorBase):
 
     def collective_rpc(
         self,
-        method: Union[str, Callable],
-        timeout: Optional[float] = None,
+        method: str | Callable,
+        timeout: float | None = None,
         args: tuple = (),
-        kwargs: Optional[dict[str, Any]] = None,
+        kwargs: dict[str, Any] | None = None,
     ) -> list[Any]:
         return self._run_workers(method, *args, **(kwargs or {}))
 
     @abstractmethod
     def _run_workers(
         self,
-        method: Union[str, Callable],
+        method: str | Callable,
         *args,
         async_run_tensor_parallel_workers_only: bool = False,
-        max_concurrent_workers: Optional[int] = None,
+        max_concurrent_workers: int | None = None,
         **kwargs,
     ) -> Any:
         """Runs the given method on all workers.
@@ -374,7 +374,7 @@ class DistributedExecutorBase(ExecutorBase):
     @abstractmethod
     async def _driver_execute_model_async(
         self,
-        execute_model_req: Optional[ExecuteModelRequest] = None,
+        execute_model_req: ExecuteModelRequest | None = None,
     ) -> list[SamplerOutput]:
         """Execute the model asynchronously in the driver worker.
 
diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py
index 6a9608d70b69d..943c6a27f1e8f 100644
--- a/vllm/executor/ray_distributed_executor.py
+++ b/vllm/executor/ray_distributed_executor.py
@@ -4,8 +4,9 @@
 import asyncio
 import os
 from collections import defaultdict
+from collections.abc import Callable
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Callable, Optional, Union
+from typing import TYPE_CHECKING, Any
 
 import cloudpickle
 import msgspec
@@ -71,7 +72,7 @@ class RayDistributedExecutor(DistributedExecutorBase):
     uses_ray: bool = True
 
     def _init_executor(self) -> None:
-        self.forward_dag: Optional[ray.dag.CompiledDAG] = None
+        self.forward_dag: ray.dag.CompiledDAG | None = None
         if envs.VLLM_USE_V1:
             # V1 uses SPMD worker and compiled DAG
             os.environ["VLLM_USE_RAY_SPMD_WORKER"] = "1"
@@ -114,10 +115,10 @@ class RayDistributedExecutor(DistributedExecutorBase):
         self._init_workers_ray(placement_group)
 
         self.input_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook)
-        self.output_decoder = msgspec.msgpack.Decoder(Optional[list[SamplerOutput]])
+        self.output_decoder = msgspec.msgpack.Decoder(list[SamplerOutput] | None)
         self.use_v1 = envs.VLLM_USE_V1
 
-        self.pp_locks: Optional[list[asyncio.Lock]] = None
+        self.pp_locks: list[asyncio.Lock] | None = None
         if not self.use_ray_compiled_dag:
             self.driver_exec_method = make_async(self.driver_worker.execute_method)
 
@@ -162,7 +163,7 @@ class RayDistributedExecutor(DistributedExecutorBase):
 
         # The driver dummy worker does not actually use any resources.
         # It holds the resource for the driver worker.
-        self.driver_dummy_worker: Optional[RayWorkerWrapper] = None
+        self.driver_dummy_worker: RayWorkerWrapper | None = None
         # The remaining workers are the actual ray actors.
         self.workers: list[RayWorkerWrapper] = []
 
@@ -432,8 +433,8 @@ class RayDistributedExecutor(DistributedExecutorBase):
                 self.non_driver_workers.append(worker)
 
     def _driver_execute_model(
-        self, execute_model_req: Optional[ExecuteModelRequest]
-    ) -> Optional[list[SamplerOutput]]:
+        self, execute_model_req: ExecuteModelRequest | None
+    ) -> list[SamplerOutput] | None:
         """Run execute_model in the driver worker.
 
         Passing None will cause the driver to stop the model execution
@@ -463,10 +464,10 @@ class RayDistributedExecutor(DistributedExecutorBase):
 
     def _run_workers(
         self,
-        method: Union[str, Callable],
+        method: str | Callable,
         *args,
         async_run_tensor_parallel_workers_only: bool = False,
-        max_concurrent_workers: Optional[int] = None,
+        max_concurrent_workers: int | None = None,
         **kwargs,
     ) -> Any:
         """Runs the given method on all workers. Can be used in the following
@@ -683,7 +684,7 @@ class RayDistributedExecutor(DistributedExecutorBase):
         return self.output_decoder.decode(output)
 
     async def _driver_execute_model_async(
-        self, execute_model_req: Optional[ExecuteModelRequest] = None
+        self, execute_model_req: ExecuteModelRequest | None = None
     ) -> list[SamplerOutput]:
         assert not self.use_ray_spmd_worker, (
             "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1"
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index c3c8a70678add..d12151bb9485a 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -4,7 +4,7 @@
 import os
 import time
 from collections import defaultdict
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING, Union
 
 import msgspec
 
@@ -72,7 +72,7 @@ try:
 
         def execute_model_spmd(
             self,
-            req_or_tuple: Union[bytes, tuple[bytes, Optional[IntermediateTensors]]],
+            req_or_tuple: bytes | tuple[bytes, IntermediateTensors | None],
         ) -> bytes:
             """Execute model in SPMD fashion: used only when SPMD worker and
             compiled DAG are both enabled.
@@ -300,7 +300,7 @@ def _wait_until_pg_removed(current_placement_group: "PlacementGroup"):
 
 def initialize_ray_cluster(
     parallel_config: ParallelConfig,
-    ray_address: Optional[str] = None,
+    ray_address: str | None = None,
 ):
     """Initialize the distributed cluster with Ray.
 
diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py
index 612fd73c12b15..c6fa279e05686 100644
--- a/vllm/executor/uniproc_executor.py
+++ b/vllm/executor/uniproc_executor.py
@@ -1,10 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
+from collections.abc import Callable
 from concurrent.futures import Future, ThreadPoolExecutor
 from functools import cached_property
 from multiprocessing import Lock
-from typing import Any, Callable, Optional, Union
+from typing import Any
 
 import torch
 import torch.distributed as dist
@@ -36,7 +37,7 @@ class UniProcExecutor(ExecutorBase):
             shared_worker_lock=Lock(),
         )
 
-        self.async_output_thread: Optional[ThreadPoolExecutor] = None
+        self.async_output_thread: ThreadPoolExecutor | None = None
         if self.max_concurrent_batches > 1:
             self.async_output_thread = ThreadPoolExecutor(
                 max_workers=1, thread_name_prefix="WorkerAsyncOutput"
@@ -60,10 +61,10 @@ class UniProcExecutor(ExecutorBase):
 
     def collective_rpc(
         self,
-        method: Union[str, Callable],
-        timeout: Optional[float] = None,
+        method: str | Callable,
+        timeout: float | None = None,
         args: tuple = (),
-        kwargs: Optional[dict] = None,
+        kwargs: dict | None = None,
         non_block: bool = False,
     ) -> list[Any]:
         if kwargs is None:
diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index 36f3062a9e3a0..484de15040c21 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -5,7 +5,7 @@ import time
 from collections import defaultdict
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union
+from typing import TYPE_CHECKING, Any, NamedTuple, Union
 
 import torch
 
@@ -84,7 +84,7 @@ class DPMetadata:
     num_tokens_across_dp_cpu: torch.Tensor
 
     # NOTE: local_sizes should only be set by the chunked_sizes context manager
-    local_sizes: Optional[list[int]] = None
+    local_sizes: list[int] | None = None
 
     @staticmethod
     def make(
@@ -158,7 +158,7 @@ class DPMetadata:
         finally:
             self.local_sizes = None
 
-    def get_chunk_sizes_across_dp_rank(self) -> Optional[list[int]]:
+    def get_chunk_sizes_across_dp_rank(self) -> list[int] | None:
         assert self.local_sizes is not None
         return self.local_sizes
 
@@ -194,13 +194,13 @@ class ForwardContext:
     # TODO: remove after making all virtual_engines share the same kv cache
     virtual_engine: int  # set dynamically for each forward pass
     # set dynamically for each forward pass
-    dp_metadata: Optional[DPMetadata] = None
+    dp_metadata: DPMetadata | None = None
     # determine the cudagraph style at runtime to be FULL, PIECEWISE, or NONE.
     # by default NONE, no cudagraph is used.
     cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE
-    batch_descriptor: Optional[BatchDescriptor] = None
+    batch_descriptor: BatchDescriptor | None = None
 
-    ubatch_slices: Optional[UBatchSlices] = None
+    ubatch_slices: UBatchSlices | None = None
 
     def __post_init__(self):
         assert self.cudagraph_runtime_mode.valid_runtime_modes(), (
@@ -208,7 +208,7 @@ class ForwardContext:
         )
 
 
-_forward_context: Optional[ForwardContext] = None
+_forward_context: ForwardContext | None = None
 
 
 def get_forward_context() -> ForwardContext:
@@ -224,10 +224,10 @@ def create_forward_context(
     attn_metadata: Any,
     vllm_config: VllmConfig,
     virtual_engine: int = 0,
-    dp_metadata: Optional[DPMetadata] = None,
+    dp_metadata: DPMetadata | None = None,
     cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
-    batch_descriptor: Optional[BatchDescriptor] = None,
-    ubatch_slices: Optional[UBatchSlices] = None,
+    batch_descriptor: BatchDescriptor | None = None,
+    ubatch_slices: UBatchSlices | None = None,
 ):
     return ForwardContext(
         no_compile_layers=vllm_config.compilation_config.static_forward_context,
@@ -241,7 +241,7 @@ def create_forward_context(
 
 
 @contextmanager
-def override_forward_context(forward_context: Optional[ForwardContext]):
+def override_forward_context(forward_context: ForwardContext | None):
     """A context manager that overrides the current forward context.
     This is used to override the forward context for a specific
     forward pass.
@@ -260,11 +260,11 @@ def set_forward_context(
     attn_metadata: Any,
     vllm_config: VllmConfig,
     virtual_engine: int = 0,
-    num_tokens: Optional[int] = None,
-    num_tokens_across_dp: Optional[torch.Tensor] = None,
+    num_tokens: int | None = None,
+    num_tokens_across_dp: torch.Tensor | None = None,
     cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
-    batch_descriptor: Optional[BatchDescriptor] = None,
-    ubatch_slices: Optional[UBatchSlices] = None,
+    batch_descriptor: BatchDescriptor | None = None,
+    ubatch_slices: UBatchSlices | None = None,
 ):
     """A context manager that stores the current forward context,
     can be attention metadata, etc.
@@ -275,7 +275,7 @@ def set_forward_context(
     if need_to_track_batchsize:
         forward_start_time = time.perf_counter()
 
-    dp_metadata: Optional[DPMetadata] = None
+    dp_metadata: DPMetadata | None = None
     if vllm_config.parallel_config.data_parallel_size > 1 and (
         attn_metadata is not None or num_tokens is not None
     ):
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index be14decb4ac9d..5a8304ac05a67 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable
-from typing import TYPE_CHECKING, Any, Generic, Literal, Optional, Union, cast
+from typing import TYPE_CHECKING, Any, Generic, Literal, TypeAlias, cast
 
 import torch
 from typing_extensions import NotRequired, TypedDict, TypeIs, TypeVar
@@ -12,6 +12,10 @@ if TYPE_CHECKING:
         MultiModalInputs,
         MultiModalUUIDDict,
     )
+else:
+    MultiModalDataDict = object
+    MultiModalInputs = object
+    MultiModalUUIDDict = object
 
 
 class TextPrompt(TypedDict):
@@ -20,13 +24,13 @@ class TextPrompt(TypedDict):
     prompt: str
     """The input text to be tokenized before passing to the model."""
 
-    multi_modal_data: NotRequired[Optional["MultiModalDataDict"]]
+    multi_modal_data: NotRequired[MultiModalDataDict | None]
     """
     Optional multi-modal data to pass to the model,
     if the model supports it.
     """
 
-    mm_processor_kwargs: NotRequired[Optional[dict[str, Any]]]
+    mm_processor_kwargs: NotRequired[dict[str, Any] | None]
     """
     Optional multi-modal processor kwargs to be forwarded to the
     multimodal input mapper & processor. Note that if multiple modalities
@@ -34,7 +38,7 @@ class TextPrompt(TypedDict):
     to pass the mm_processor_kwargs to each of them.
     """
 
-    multi_modal_uuids: NotRequired["MultiModalUUIDDict"]
+    multi_modal_uuids: NotRequired[MultiModalUUIDDict]
     """
     Optional user-specified UUIDs for multimodal items, mapped by modality.
     Lists must match the number of items per modality and may contain `None`.
@@ -61,13 +65,13 @@ class TokensPrompt(TypedDict):
     token_type_ids: NotRequired[list[int]]
     """A list of token type IDs to pass to the cross encoder model."""
 
-    multi_modal_data: NotRequired[Optional["MultiModalDataDict"]]
+    multi_modal_data: NotRequired[MultiModalDataDict | None]
     """
     Optional multi-modal data to pass to the model,
     if the model supports it.
     """
 
-    mm_processor_kwargs: NotRequired[Optional[dict[str, Any]]]
+    mm_processor_kwargs: NotRequired[dict[str, Any] | None]
     """
     Optional multi-modal processor kwargs to be forwarded to the
     multimodal input mapper & processor. Note that if multiple modalities
@@ -75,7 +79,7 @@ class TokensPrompt(TypedDict):
     to pass the mm_processor_kwargs to each of them.
     """
 
-    multi_modal_uuids: NotRequired["MultiModalUUIDDict"]
+    multi_modal_uuids: NotRequired[MultiModalUUIDDict]
     """
     Optional user-specified UUIDs for multimodal items, mapped by modality.
     Lists must match the number of items per modality and may contain `None`.
@@ -111,7 +115,7 @@ class DataPrompt(TypedDict):
     """The input data format"""
 
 
-SingletonPrompt = Union[str, TextPrompt, TokensPrompt, EmbedsPrompt]
+SingletonPrompt: TypeAlias = str | TextPrompt | TokensPrompt | EmbedsPrompt
 """
 Set of possible schemas for a single prompt:
 
@@ -185,12 +189,12 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]):
 
     encoder_prompt: _T1_co
 
-    decoder_prompt: Optional[_T2_co]
+    decoder_prompt: _T2_co | None
 
     mm_processor_kwargs: NotRequired[dict[str, Any]]
 
 
-PromptType = Union[SingletonPrompt, ExplicitEncoderDecoderPrompt]
+PromptType: TypeAlias = SingletonPrompt | ExplicitEncoderDecoderPrompt
 """
 Set of possible schemas for an LLM input, including
 both decoder-only and encoder/decoder input types:
@@ -220,7 +224,7 @@ class TokenInputs(TypedDict):
 
 def token_inputs(
     prompt_token_ids: list[int],
-    cache_salt: Optional[str] = None,
+    cache_salt: str | None = None,
 ) -> TokenInputs:
     """Construct [`TokenInputs`][vllm.inputs.data.TokenInputs] from optional
     values."""
@@ -249,7 +253,7 @@ class EmbedsInputs(TypedDict):
 
 def embeds_inputs(
     prompt_embeds: torch.Tensor,
-    cache_salt: Optional[str] = None,
+    cache_salt: str | None = None,
 ) -> EmbedsInputs:
     """Construct [`EmbedsInputs`][vllm.inputs.data.EmbedsInputs] from optional
     values."""
@@ -261,7 +265,7 @@ def embeds_inputs(
     return inputs
 
 
-DecoderOnlyInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"]
+DecoderOnlyInputs: TypeAlias = TokenInputs | EmbedsInputs | MultiModalInputs
 """
 The inputs in [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] before they are
 passed to the model executor.
@@ -277,20 +281,20 @@ class EncoderDecoderInputs(TypedDict):
     This specifies the required data for encoder-decoder models.
     """
 
-    encoder: Union[TokenInputs, "MultiModalInputs"]
+    encoder: TokenInputs | MultiModalInputs
     """The inputs for the encoder portion."""
 
-    decoder: Union[TokenInputs, "MultiModalInputs"]
+    decoder: TokenInputs | MultiModalInputs
     """The inputs for the decoder portion."""
 
 
-SingletonInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"]
+SingletonInputs: TypeAlias = TokenInputs | EmbedsInputs | MultiModalInputs
 """
 A processed [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] which can be
 passed to [`Sequence`][collections.abc.Sequence].
 """
 
-ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs]
+ProcessorInputs: TypeAlias = DecoderOnlyInputs | EncoderDecoderInputs
 """
 The outputs from [`vllm.inputs.preprocess.InputPreprocessor`][].
 """
@@ -301,8 +305,8 @@ _T2 = TypeVar("_T2", bound=SingletonPrompt, default=SingletonPrompt)
 
 def build_explicit_enc_dec_prompt(
     encoder_prompt: _T1,
-    decoder_prompt: Optional[_T2],
-    mm_processor_kwargs: Optional[dict[str, Any]] = None,
+    decoder_prompt: _T2 | None,
+    mm_processor_kwargs: dict[str, Any] | None = None,
 ) -> ExplicitEncoderDecoderPrompt[_T1, _T2]:
     if mm_processor_kwargs is None:
         mm_processor_kwargs = {}
@@ -315,10 +319,8 @@ def build_explicit_enc_dec_prompt(
 
 def zip_enc_dec_prompts(
     enc_prompts: Iterable[_T1],
-    dec_prompts: Iterable[Optional[_T2]],
-    mm_processor_kwargs: Optional[
-        Union[Iterable[dict[str, Any]], dict[str, Any]]
-    ] = None,
+    dec_prompts: Iterable[_T2 | None],
+    mm_processor_kwargs: Iterable[dict[str, Any]] | dict[str, Any] | None = None,
 ) -> list[ExplicitEncoderDecoderPrompt[_T1, _T2]]:
     """
     Zip encoder and decoder prompts together into a list of
@@ -350,7 +352,7 @@ def zip_enc_dec_prompts(
 
 def to_enc_dec_tuple_list(
     enc_dec_prompts: Iterable[ExplicitEncoderDecoderPrompt[_T1, _T2]],
-) -> list[tuple[_T1, Optional[_T2]]]:
+) -> list[tuple[_T1, _T2 | None]]:
     return [
         (enc_dec_prompt["encoder_prompt"], enc_dec_prompt["decoder_prompt"])
         for enc_dec_prompt in enc_dec_prompts
diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py
index 2f7bd50df022e..5cfef7f5b6d95 100644
--- a/vllm/inputs/parse.py
+++ b/vllm/inputs/parse.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Sequence
-from typing import TYPE_CHECKING, Literal, NamedTuple, Optional, TypedDict, Union, cast
+from typing import TYPE_CHECKING, Literal, NamedTuple, TypeAlias, TypedDict, cast
 
 from typing_extensions import TypeIs
 
@@ -23,8 +23,8 @@ if TYPE_CHECKING:
 
 
 def parse_raw_prompts(
-    prompt: Union[str, list[str], list[int], list[list[int]]],
-) -> Union[Sequence[TextPrompt], Sequence[TokensPrompt]]:
+    prompt: str | list[str] | list[int] | list[list[int]],
+) -> Sequence[TextPrompt] | Sequence[TokensPrompt]:
     if isinstance(prompt, str):
         # case 1: a string
         return [TextPrompt(prompt=prompt)]
@@ -76,9 +76,9 @@ class ParsedEmbedsPrompt(TypedDict):
     content: EmbedsPrompt
 
 
-ParsedSingletonPrompt = Union[
-    ParsedStrPrompt, ParsedTextPrompt, ParsedTokensPrompt, ParsedEmbedsPrompt
-]
+ParsedSingletonPrompt: TypeAlias = (
+    ParsedStrPrompt | ParsedTextPrompt | ParsedTokensPrompt | ParsedEmbedsPrompt
+)
 
 
 def parse_singleton_prompt(prompt: SingletonPrompt) -> ParsedSingletonPrompt:
@@ -106,7 +106,7 @@ def is_explicit_encoder_decoder_prompt(
 
 def split_enc_dec_inputs(
     inputs: ProcessorInputs,
-) -> tuple[Optional[SingletonInputs], SingletonInputs]:
+) -> tuple[SingletonInputs | None, SingletonInputs]:
     if "encoder" in inputs and "decoder" in inputs:
         # NOTE: This passes pyright but not mypy
         return (
@@ -118,9 +118,9 @@ def split_enc_dec_inputs(
 
 
 class PromptComponents(NamedTuple):
-    text: Optional[str] = None
-    token_ids: Optional[list[int]] = None
-    embeds: Optional["torch.Tensor"] = None
+    text: str | None = None
+    token_ids: list[int] | None = None
+    embeds: "torch.Tensor | None" = None
 
 
 def get_prompt_components(prompt: PromptType) -> PromptComponents:
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 809f6c8d83f01..80d5322a34c3a 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Mapping
-from typing import Any, Optional, Union, cast
+from typing import Any, cast
 
 from typing_extensions import assert_never
 
@@ -46,9 +46,9 @@ class InputPreprocessor:
     def __init__(
         self,
         model_config: ModelConfig,
-        tokenizer: Optional[AnyTokenizer],
+        tokenizer: AnyTokenizer | None,
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
-        mm_processor_cache: Optional[BaseMultiModalProcessorCache] = None,
+        mm_processor_cache: BaseMultiModalProcessorCache | None = None,
     ) -> None:
         super().__init__()
 
@@ -67,7 +67,7 @@ class InputPreprocessor:
 
         return self.tokenizer
 
-    def get_bos_token_id(self) -> Optional[int]:
+    def get_bos_token_id(self) -> int | None:
         if self.tokenizer is None:
             logger.warning_once(
                 "Using None for BOS token id because tokenizer is not initialized"
@@ -76,7 +76,7 @@ class InputPreprocessor:
 
         return self.tokenizer.bos_token_id
 
-    def get_eos_token_id(self) -> Optional[int]:
+    def get_eos_token_id(self) -> int | None:
         if self.tokenizer is None:
             logger.warning_once(
                 "Using None for EOS token id because tokenizer is not initialized"
@@ -85,7 +85,7 @@ class InputPreprocessor:
 
         return self.tokenizer.eos_token_id
 
-    def get_decoder_start_token_id(self) -> Optional[int]:
+    def get_decoder_start_token_id(self) -> int | None:
         """
         Obtain the decoder start token id employed by an encoder/decoder
         model. Returns None for non-encoder/decoder models or if the
@@ -157,7 +157,7 @@ class InputPreprocessor:
 
     def _prepare_decoder_input_ids_for_generation(
         self,
-        decoder_input_ids: Optional[list[int]],
+        decoder_input_ids: list[int] | None,
     ) -> list[int]:
         """
         Prepares `decoder_input_ids` for generation with encoder-decoder models.
@@ -194,7 +194,7 @@ class InputPreprocessor:
 
     def _get_tokenization_kw(
         self,
-        overrides: Optional[dict[str, Any]] = None,
+        overrides: dict[str, Any] | None = None,
     ) -> dict[str, Any]:
         kwargs = dict[str, Any]()
 
@@ -212,7 +212,7 @@ class InputPreprocessor:
     def _tokenize_prompt(
         self,
         prompt: str,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> list[int]:
         """
         Apply the model's tokenizer to a text prompt, returning the
@@ -251,12 +251,12 @@ class InputPreprocessor:
 
     def _process_multimodal(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data: MultiModalDataDict,
-        mm_processor_kwargs: Optional[Mapping[str, object]],
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        mm_processor_kwargs: Mapping[str, object] | None,
+        tokenization_kwargs: dict[str, Any] | None = None,
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalInputs:
         """
         Apply the model's multi-modal processor to a multi-modal prompt,
@@ -320,7 +320,7 @@ class InputPreprocessor:
         )
 
     def _truncate_inputs(
-        self, inputs: list[int], tokenization_kwargs: Optional[dict[str, Any]] = None
+        self, inputs: list[int], tokenization_kwargs: dict[str, Any] | None = None
     ) -> list[int]:
         if (
             not tokenization_kwargs
@@ -339,15 +339,15 @@ class InputPreprocessor:
     def _process_tokens(
         self,
         parsed_content: TokensPrompt,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
-    ) -> Union[TokenInputs, MultiModalInputs]:
+        mm_uuids: MultiModalUUIDDict | None = None,
+    ) -> TokenInputs | MultiModalInputs:
         prompt_token_ids = self._truncate_inputs(
             parsed_content["prompt_token_ids"], tokenization_kwargs
         )
 
-        inputs: Union[TokenInputs, MultiModalInputs]
+        inputs: TokenInputs | MultiModalInputs
         if self.model_config.is_multimodal_model:
             inputs = self._process_multimodal(
                 prompt_token_ids,
@@ -370,13 +370,13 @@ class InputPreprocessor:
     def _process_text(
         self,
         parsed_content: TextPrompt,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
-    ) -> Union[TokenInputs, MultiModalInputs]:
+        mm_uuids: MultiModalUUIDDict | None = None,
+    ) -> TokenInputs | MultiModalInputs:
         prompt_text = parsed_content["prompt"]
 
-        inputs: Union[TokenInputs, MultiModalInputs]
+        inputs: TokenInputs | MultiModalInputs
         if self.model_config.is_multimodal_model:
             inputs = self._process_multimodal(
                 prompt_text,
@@ -403,9 +403,9 @@ class InputPreprocessor:
     def _prompt_to_llm_inputs(
         self,
         prompt: SingletonPrompt,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> SingletonInputs:
         """
         Extract the singleton inputs from a prompt.
@@ -445,7 +445,7 @@ class InputPreprocessor:
     def _build_enc_dec_llm_inputs(
         self,
         encoder_inputs: SingletonInputs,
-        decoder_inputs: Optional[SingletonInputs],
+        decoder_inputs: SingletonInputs | None,
     ) -> EncoderDecoderInputs:
         if (
             encoder_inputs["type"] == "embeds"
@@ -457,10 +457,8 @@ class InputPreprocessor:
             )
 
         # Needed for mypy
-        encoder_inputs = cast(Union[TokenInputs, MultiModalInputs], encoder_inputs)
-        decoder_inputs = cast(
-            Optional[Union[TokenInputs, MultiModalInputs]], decoder_inputs
-        )
+        encoder_inputs = cast(TokenInputs | MultiModalInputs, encoder_inputs)
+        decoder_inputs = cast(TokenInputs | MultiModalInputs | None, decoder_inputs)
 
         if decoder_inputs is None:
             if self.model_config.hf_config.model_type == "whisper":
@@ -491,8 +489,8 @@ class InputPreprocessor:
 
     def _split_enc_dec_mm_inputs(
         self,
-        inputs: Union[SingletonInputs, MultiModalEncDecInputs],
-        decoder_inputs_to_override: Optional[SingletonInputs] = None,
+        inputs: SingletonInputs | MultiModalEncDecInputs,
+        decoder_inputs_to_override: SingletonInputs | None = None,
     ) -> tuple[SingletonInputs, SingletonInputs]:
         """
         For encoder/decoder models only:
@@ -509,11 +507,11 @@ class InputPreprocessor:
 
         # Needed for mypy
         inputs = cast(
-            Union[TokenInputs, MultiModalInputs, MultiModalEncDecInputs],
+            TokenInputs | MultiModalInputs | MultiModalEncDecInputs,
             inputs,
         )
         decoder_inputs_to_override = cast(
-            Optional[Union[TokenInputs, MultiModalInputs]],
+            TokenInputs | MultiModalInputs | None,
             decoder_inputs_to_override,
         )
 
@@ -553,9 +551,9 @@ class InputPreprocessor:
     def _process_encoder_decoder_prompt(
         self,
         prompt: PromptType,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> EncoderDecoderInputs:
         """
         For encoder/decoder models only:
@@ -591,7 +589,7 @@ class InputPreprocessor:
           instance
         """
         encoder_inputs: SingletonInputs
-        decoder_inputs: Optional[SingletonInputs]
+        decoder_inputs: SingletonInputs | None
 
         if is_explicit_encoder_decoder_prompt(prompt):
             # `cast` is needed for mypy, but not pyright
@@ -633,7 +631,7 @@ class InputPreprocessor:
     ) -> DecoderOnlyInputs:
         if "prompt_token_ids" in prompt_inputs:
             prompt_inputs = cast(
-                Union[TokenInputs, MultiModalInputs], prompt_inputs
+                TokenInputs | MultiModalInputs, prompt_inputs
             )  # Needed for mypy
 
         return prompt_inputs
@@ -641,9 +639,9 @@ class InputPreprocessor:
     def _process_decoder_only_prompt(
         self,
         prompt: SingletonPrompt,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> DecoderOnlyInputs:
         """
         For decoder-only models:
@@ -670,9 +668,9 @@ class InputPreprocessor:
     def _preprocess(
         self,
         prompt: PromptType,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> ProcessorInputs:
         if self.model_config.is_encoder_decoder:
             # Encoder-decoder model requires special mapping of
@@ -699,9 +697,9 @@ class InputPreprocessor:
     def preprocess(
         self,
         prompt: PromptType,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> ProcessorInputs:
         """Preprocess the input prompt."""
         res = self._preprocess(
@@ -718,7 +716,7 @@ class InputPreprocessor:
 
         return res
 
-    def stat_mm_cache(self) -> Optional[MultiModalCacheStats]:
+    def stat_mm_cache(self) -> MultiModalCacheStats | None:
         mm_cache_stats = self.mm_cache_stats
         if mm_cache_stats is None:
             return None
diff --git a/vllm/logger.py b/vllm/logger.py
index 37e8495768c04..1e53ee796ca14 100644
--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -13,7 +13,7 @@ from logging import Logger
 from logging.config import dictConfig
 from os import path
 from types import MethodType
-from typing import Any, Optional, cast
+from typing import Any, cast
 
 import vllm.envs as envs
 
@@ -217,7 +217,7 @@ def _trace_calls(log_path, root_dir, frame, event, arg=None):
     return partial(_trace_calls, log_path, root_dir)
 
 
-def enable_trace_function_call(log_file_path: str, root_dir: Optional[str] = None):
+def enable_trace_function_call(log_file_path: str, root_dir: str | None = None):
     """
     Enable tracing of every function call in code under `root_dir`.
     This is useful for debugging hangs or crashes.
diff --git a/vllm/logging_utils/dump_input.py b/vllm/logging_utils/dump_input.py
index 3a97000647d60..cb289d04e3f40 100644
--- a/vllm/logging_utils/dump_input.py
+++ b/vllm/logging_utils/dump_input.py
@@ -4,7 +4,6 @@
 import contextlib
 import enum
 import json
-from typing import Optional
 
 import torch
 
@@ -57,7 +56,7 @@ def prepare_object_to_dump(obj) -> str:
 def dump_engine_exception(
     config: VllmConfig,
     scheduler_output: SchedulerOutput,
-    scheduler_stats: Optional[SchedulerStats],
+    scheduler_stats: SchedulerStats | None,
 ):
     # NOTE: ensure we can log extra info without risking raises
     # unexpected errors during logging
@@ -68,7 +67,7 @@ def dump_engine_exception(
 def _dump_engine_exception(
     config: VllmConfig,
     scheduler_output: SchedulerOutput,
-    scheduler_stats: Optional[SchedulerStats],
+    scheduler_stats: SchedulerStats | None,
 ):
     logger.error(
         "Dumping input data for V1 LLM engine (v%s) with config: %s, ",
diff --git a/vllm/logits_process.py b/vllm/logits_process.py
index 6ac30ae0028e9..7b6a6528e20e8 100644
--- a/vllm/logits_process.py
+++ b/vllm/logits_process.py
@@ -1,16 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Sequence
-from typing import Callable, Union
+from collections.abc import Callable, Sequence
+from typing import TypeAlias
 
 import torch
 
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 
-LogitsProcessor = Union[
-    Callable[[list[int], torch.Tensor], torch.Tensor],
-    Callable[[list[int], list[int], torch.Tensor], torch.Tensor],
-]
+LogitsProcessor: TypeAlias = (
+    Callable[[list[int], torch.Tensor], torch.Tensor]
+    | Callable[[list[int], list[int], torch.Tensor], torch.Tensor]
+)
 """LogitsProcessor is a function that takes a list
 of previously generated tokens, the logits tensor
 for the next token and, optionally, prompt tokens as a
diff --git a/vllm/logprobs.py b/vllm/logprobs.py
index 2458e43c690f6..21c886e0ad5eb 100644
--- a/vllm/logprobs.py
+++ b/vllm/logprobs.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import Optional
 
 
 # We use dataclass for now because it is used for
@@ -18,12 +17,12 @@ class Logprob:
     """
 
     logprob: float
-    rank: Optional[int] = None
-    decoded_token: Optional[str] = None
+    rank: int | None = None
+    decoded_token: str | None = None
 
 
 # {token_id -> logprob} per each sequence group. None if the corresponding
 # sequence group doesn't require prompt logprob.
-PromptLogprobs = list[Optional[dict[int, Logprob]]]
+PromptLogprobs = list[dict[int, Logprob] | None]
 # {token_id -> logprob} for each sequence group.
 SampleLogprobs = list[dict[int, Logprob]]
diff --git a/vllm/lora/layers/base.py b/vllm/lora/layers/base.py
index 5279247a17594..0c7e806848892 100644
--- a/vllm/lora/layers/base.py
+++ b/vllm/lora/layers/base.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING
 
 import torch
 import torch.nn as nn
@@ -15,14 +15,14 @@ if TYPE_CHECKING:
 
 class BaseLayerWithLoRA(nn.Module):
     def slice_lora_a(
-        self, lora_a: Union[torch.Tensor, list[Union[torch.Tensor, None]]]
-    ) -> Union[torch.Tensor, list[Union[torch.Tensor, None]]]:
+        self, lora_a: torch.Tensor | list[torch.Tensor | None]
+    ) -> torch.Tensor | list[torch.Tensor | None]:
         """Slice lora a if splitting for tensor parallelism."""
         ...
 
     def slice_lora_b(
-        self, lora_b: Union[torch.Tensor, list[Union[torch.Tensor, None]]]
-    ) -> Union[torch.Tensor, list[Union[torch.Tensor, None]]]:
+        self, lora_b: torch.Tensor | list[torch.Tensor | None]
+    ) -> torch.Tensor | list[torch.Tensor | None]:
         """Slice lora b if splitting with tensor parallelism."""
         ...
 
@@ -30,7 +30,7 @@ class BaseLayerWithLoRA(nn.Module):
         self,
         max_loras: int,
         lora_config: LoRAConfig,
-        model_config: Optional[PretrainedConfig] = None,
+        model_config: PretrainedConfig | None = None,
     ) -> None:
         """Initializes lora matrices."""
         ...
@@ -44,7 +44,7 @@ class BaseLayerWithLoRA(nn.Module):
         index: int,
         lora_a: torch.Tensor,
         lora_b: torch.Tensor,
-        embeddings_tensor: Optional[torch.Tensor],
+        embeddings_tensor: torch.Tensor | None,
     ):
         """Overwrites lora tensors at index."""
         ...
@@ -61,7 +61,7 @@ class BaseLayerWithLoRA(nn.Module):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         """Returns True if the layer can be replaced by this LoRA layer."""
         raise NotImplementedError
diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py
index da053f0923aba..d619a0edc1241 100644
--- a/vllm/lora/layers/base_linear.py
+++ b/vllm/lora/layers/base_linear.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 from transformers import PretrainedConfig
@@ -37,7 +36,7 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
         self,
         max_loras: int,
         lora_config: LoRAConfig,
-        model_config: Optional[PretrainedConfig] = None,
+        model_config: PretrainedConfig | None = None,
     ) -> None:
         self.lora_config = lora_config
         #
@@ -97,7 +96,7 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
         index: int,
         lora_a: torch.Tensor,
         lora_b: torch.Tensor,
-        embeddings_tensor: Optional[torch.Tensor],
+        embeddings_tensor: torch.Tensor | None,
     ):
         # Except for QKVParallelLinearWithLoRA and
         # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers
@@ -119,9 +118,7 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
             lora_b, non_blocking=True
         )
 
-    def apply(
-        self, x: torch.Tensor, bias: Optional[torch.Tensor] = None
-    ) -> torch.Tensor:
+    def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
         output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
 
         # In transformers backend, x and output have extra batch dimension like
@@ -131,7 +128,7 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
             output = output.flatten(0, 1)
             x = x.flatten(0, 1)
 
-        lora_output: Optional[torch.Tensor] = self.punica_wrapper.add_lora_linear(
+        lora_output: torch.Tensor | None = self.punica_wrapper.add_lora_linear(
             output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0, self.output_slices
         )
         if not current_platform.can_update_inplace():
@@ -160,7 +157,7 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
             raise ValueError(f"Unsupported base layer: {self.base_layer}")
 
     @property
-    def bias(self) -> Optional[torch.Tensor]:
+    def bias(self) -> torch.Tensor | None:
         if hasattr(self.base_layer, "bias"):
             return self.base_layer.bias
         else:
diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py
index c49b90a80ceac..637ded9b2a0f0 100644
--- a/vllm/lora/layers/column_parallel_linear.py
+++ b/vllm/lora/layers/column_parallel_linear.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
 
 import torch
 import torch.nn as nn
@@ -46,7 +45,7 @@ def _mcp_apply(x, bias, layer: "ColumnParallelLinearWithLoRA"):
         device=x.device,
     )
 
-    shrunk_buffers: Optional[torch.Tensor] = layer.punica_wrapper.add_shrink(
+    shrunk_buffers: torch.Tensor | None = layer.punica_wrapper.add_shrink(
         buffers, x, layer.lora_a_stacked, 1.0
     )
 
@@ -55,7 +54,7 @@ def _mcp_apply(x, bias, layer: "ColumnParallelLinearWithLoRA"):
 
     buffers = tensor_model_parallel_all_gather(buffers)
 
-    lora_output: Optional[torch.Tensor] = layer.punica_wrapper.add_expand(
+    lora_output: torch.Tensor | None = layer.punica_wrapper.add_expand(
         output,
         buffers,
         layer.lora_b_stacked,
@@ -121,7 +120,7 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
 
     def forward(
         self, input_: torch.Tensor
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None]:
         """Forward of ColumnParallelLinear
 
         Args:
@@ -154,7 +153,7 @@ class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         return type(source_layer) is ColumnParallelLinear or (
             type(source_layer) is MergedColumnParallelLinear
@@ -172,7 +171,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
     """
 
     def __init__(
-        self, base_layer: Union[MergedColumnParallelLinear, QKVParallelLinear]
+        self, base_layer: MergedColumnParallelLinear | QKVParallelLinear
     ) -> None:
         super().__init__(base_layer)
         # There are two LoRA layers
@@ -189,7 +188,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
         self,
         max_loras: int,
         lora_config: LoRAConfig,
-        model_config: Optional[PretrainedConfig] = None,
+        model_config: PretrainedConfig | None = None,
     ) -> None:
         """
         The main reason for overriding this function is to enhance  code
@@ -227,13 +226,13 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
         )
 
     def slice_lora_a(
-        self, lora_a: list[Union[torch.Tensor, None]]
-    ) -> list[Union[torch.Tensor, None]]:
+        self, lora_a: list[torch.Tensor | None]
+    ) -> list[torch.Tensor | None]:
         return lora_a
 
     def slice_lora_b(
-        self, lora_b: list[Union[torch.Tensor, None]]
-    ) -> list[Union[torch.Tensor, None]]:
+        self, lora_b: list[torch.Tensor | None]
+    ) -> list[torch.Tensor | None]:
         sliced_lora_b = [None] * self.n_slices
         for i, (shard_id, shard_size) in enumerate(
             zip(self.output_ids, self.output_slices)
@@ -249,7 +248,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
         index: int,
         lora_a: torch.Tensor,
         lora_b: torch.Tensor,
-        embeddings_tensor: Optional[torch.Tensor],
+        embeddings_tensor: torch.Tensor | None,
     ):
         self.reset_lora(index)
 
@@ -274,7 +273,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         return (
             type(source_layer) is MergedColumnParallelLinear
@@ -340,7 +339,7 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         return type(source_layer) is QKVParallelLinear and len(packed_modules_list) == 1
 
@@ -383,7 +382,7 @@ class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA):
         self,
         max_loras: int,
         lora_config: LoRAConfig,
-        model_config: Optional[PretrainedConfig] = None,
+        model_config: PretrainedConfig | None = None,
     ) -> None:
         """
         The main reason for overloading this function is to handle inconsistent
@@ -398,7 +397,7 @@ class MergedQKVParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         return type(source_layer) is QKVParallelLinear and len(packed_modules_list) == 3
 
@@ -426,9 +425,7 @@ class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA):
         lora_a = lora_a[start_idx : start_idx + shard_size, :]
         return lora_a
 
-    def apply(
-        self, x: torch.Tensor, bias: Optional[torch.Tensor] = None
-    ) -> torch.Tensor:
+    def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
         return _mcp_apply(x, bias, self)
 
     @classmethod
@@ -438,7 +435,7 @@ class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         # specifying kwargs so they can be easily accessed in decorator
         return super().can_replace_layer(
@@ -459,8 +456,8 @@ class MergedColumnParallelLinearWithShardedLoRA(MergedColumnParallelLinearWithLo
     """
 
     def slice_lora_a(
-        self, lora_a: list[Union[torch.Tensor, None]]
-    ) -> list[Union[torch.Tensor, None]]:
+        self, lora_a: list[torch.Tensor | None]
+    ) -> list[torch.Tensor | None]:
         # NOTE: lora_a contains 2 subloras, and each sublora could be None.
         output_shard_size = self.lora_a_stacked[0].shape[2]
         output_start_idx = self.tp_rank * output_shard_size
@@ -474,9 +471,7 @@ class MergedColumnParallelLinearWithShardedLoRA(MergedColumnParallelLinearWithLo
         ]
         return lora_a
 
-    def apply(
-        self, x: torch.Tensor, bias: Optional[torch.Tensor] = None
-    ) -> torch.Tensor:
+    def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
         return _mcp_apply(x, bias, self)
 
     @classmethod
@@ -486,7 +481,7 @@ class MergedColumnParallelLinearWithShardedLoRA(MergedColumnParallelLinearWithLo
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         # specifying kwargs so they can be easily accessed in decorator
         return super().can_replace_layer(
@@ -512,9 +507,7 @@ class QKVParallelLinearWithShardedLoRA(QKVParallelLinearWithLoRA):
         lora_a = lora_a[start_idx : start_idx + shard_size, :]
         return lora_a
 
-    def apply(
-        self, x: torch.Tensor, bias: Optional[torch.Tensor] = None
-    ) -> torch.Tensor:
+    def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
         return _mcp_apply(x, bias, self)
 
     @classmethod
@@ -524,7 +517,7 @@ class QKVParallelLinearWithShardedLoRA(QKVParallelLinearWithLoRA):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         # specifying kwargs so they can be easily accessed in decorator
         return super().can_replace_layer(
@@ -545,8 +538,8 @@ class MergedQKVParallelLinearWithShardedLoRA(MergedQKVParallelLinearWithLoRA):
     """
 
     def slice_lora_a(
-        self, lora_a: list[Union[torch.Tensor, None]]
-    ) -> list[Union[torch.Tensor, None]]:
+        self, lora_a: list[torch.Tensor | None]
+    ) -> list[torch.Tensor | None]:
         # NOTE: lora_a contains 3 subloras, and each sublora could be None.
         shard_size = [self.lora_a_stacked[i].shape[2] for i in range(3)]
         start_idx = [self.tp_rank * shard_size[i] for i in range(3)]
@@ -563,9 +556,7 @@ class MergedQKVParallelLinearWithShardedLoRA(MergedQKVParallelLinearWithLoRA):
         ]
         return lora_a
 
-    def apply(
-        self, x: torch.Tensor, bias: Optional[torch.Tensor] = None
-    ) -> torch.Tensor:
+    def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
         return _mcp_apply(x, bias, self)
 
     @classmethod
@@ -575,7 +566,7 @@ class MergedQKVParallelLinearWithShardedLoRA(MergedQKVParallelLinearWithLoRA):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         # specifying kwargs so they can be easily accessed in decorator
         return super().can_replace_layer(
diff --git a/vllm/lora/layers/logits_processor.py b/vllm/lora/layers/logits_processor.py
index f3ca60fb28d90..adc5e861f57fb 100644
--- a/vllm/lora/layers/logits_processor.py
+++ b/vllm/lora/layers/logits_processor.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -41,7 +40,7 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
         hidden_size: int,
         dtype: torch.dtype,
         device: torch.device,
-        sharded_to_full_mapping: Optional[list[int]],
+        sharded_to_full_mapping: list[int] | None,
     ) -> None:
         super().__init__()
         self.base_layer = base_layer
@@ -88,7 +87,7 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
         self,
         max_loras: int,
         lora_config: LoRAConfig,
-        model_config: Optional[PretrainedConfig] = None,
+        model_config: PretrainedConfig | None = None,
     ) -> None:
         # TODO: Verify if this condition can be further relaxed
         if 32000 < self.base_layer.vocab_size > 257024:
@@ -142,7 +141,7 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
         index: int,
         lora_a: torch.Tensor,
         lora_b: torch.Tensor,
-        embeddings_tensor: Optional[torch.Tensor],
+        embeddings_tensor: torch.Tensor | None,
     ):
         self.reset_lora(index)
         self.lora_a_stacked[index, 0, : lora_a.shape[0], : lora_a.shape[1]].copy_(
@@ -162,8 +161,8 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
         self,
         hidden_states: torch.Tensor,
         lm_head: VocabParallelEmbedding,
-        embedding_bias: Optional[torch.Tensor] = None,
-    ) -> Optional[torch.Tensor]:
+        embedding_bias: torch.Tensor | None = None,
+    ) -> torch.Tensor | None:
         # Get the logits for the next tokens.
         logits = lm_head.quant_method.apply(lm_head, hidden_states)
         if embedding_bias is not None:
@@ -227,7 +226,7 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
             + lora_logits.shape[1],
         ] = lora_logits
 
-        lora_output: Optional[torch.Tensor] = self.punica_wrapper.add_lora_logits(
+        lora_output: torch.Tensor | None = self.punica_wrapper.add_lora_logits(
             logits, hidden_states, self.lora_a_stacked, self.lora_b_stacked, 1.0
         )
 
@@ -247,7 +246,7 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         # Special handling for the LogitsProcessor.
         return False
diff --git a/vllm/lora/layers/replicated_linear.py b/vllm/lora/layers/replicated_linear.py
index 18a35cd1e0f22..5ad4a9f44f407 100644
--- a/vllm/lora/layers/replicated_linear.py
+++ b/vllm/lora/layers/replicated_linear.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
 
 import torch
 import torch.nn as nn
@@ -24,7 +23,7 @@ class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA):
 
     def forward(
         self, input_: torch.Tensor
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None]:
         """Forward of ReplicatedLinearWithLoRA
 
         Args:
@@ -54,6 +53,6 @@ class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         return type(source_layer) is ReplicatedLinear
diff --git a/vllm/lora/layers/row_parallel_linear.py b/vllm/lora/layers/row_parallel_linear.py
index fff4fb38ead90..2ef1bd98fc612 100644
--- a/vllm/lora/layers/row_parallel_linear.py
+++ b/vllm/lora/layers/row_parallel_linear.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
 
 import torch
 import torch.nn as nn
@@ -41,7 +40,7 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
 
     def forward(
         self, input_: torch.Tensor
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[torch.Tensor]]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None]:
         """Forward of RowParallelLinear
 
         Args:
@@ -93,7 +92,7 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         return type(source_layer) is RowParallelLinear
 
@@ -120,9 +119,7 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
         lora_b = lora_b[start_idx:end_idx, :]
         return lora_b
 
-    def apply(
-        self, x: torch.Tensor, bias: Optional[torch.Tensor] = None
-    ) -> torch.Tensor:
+    def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
         output = self.base_layer.quant_method.apply(self.base_layer, x)
 
         x = x.view(-1, x.shape[-1])
@@ -133,7 +130,7 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
             device=x.device,
         )
 
-        shrunk_buffer: Optional[torch.Tensor] = self.punica_wrapper.add_shrink(
+        shrunk_buffer: torch.Tensor | None = self.punica_wrapper.add_shrink(
             buffer, x, self.lora_a_stacked, 1.0
         )
         if not current_platform.can_update_inplace():
@@ -150,7 +147,7 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
         # NOTE offset are based on the rank.
         shard_size = self.lora_b_stacked[0].shape[2]
         offset_start = self.tp_rank * shard_size
-        lora_output: Optional[torch.Tensor] = self.punica_wrapper.add_expand(
+        lora_output: torch.Tensor | None = self.punica_wrapper.add_expand(
             output,
             buffer,
             self.lora_b_stacked,
@@ -172,7 +169,7 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         # specifying kwargs so they can be easily accessed in decorator
         return super().can_replace_layer(
diff --git a/vllm/lora/layers/vocal_parallel_embedding.py b/vllm/lora/layers/vocal_parallel_embedding.py
index 0a252b425c4a8..ca4ad8012e9c3 100644
--- a/vllm/lora/layers/vocal_parallel_embedding.py
+++ b/vllm/lora/layers/vocal_parallel_embedding.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -19,14 +18,14 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
     def __init__(self, base_layer: VocabParallelEmbedding) -> None:
         super().__init__()
         self.base_layer = base_layer
-        self.embeddings_slice: Optional[tuple[int, int]]
-        self.embeddings_weights: Optional[torch.Tensor]
+        self.embeddings_slice: tuple[int, int] | None
+        self.embeddings_weights: torch.Tensor | None
 
     def create_lora_weights(
         self,
         max_loras: int,
         lora_config: LoRAConfig,
-        model_config: Optional[PretrainedConfig] = None,
+        model_config: PretrainedConfig | None = None,
     ) -> None:
         if self.base_layer.num_added_embeddings_per_partition > 0:
             # We can start adding lora weights
@@ -90,7 +89,7 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
         index: int,
         lora_a: torch.Tensor,
         lora_b: torch.Tensor,
-        embeddings_tensor: Optional[torch.Tensor],
+        embeddings_tensor: torch.Tensor | None,
     ):
         self.reset_lora(index)
         # NOTE self.lora_a_stacked is row-major, and lora_a is col-major,
@@ -143,7 +142,7 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
                 -1,
             )
 
-        lora_output: Optional[torch.Tensor] = self.punica_wrapper.add_lora_embedding(
+        lora_output: torch.Tensor | None = self.punica_wrapper.add_lora_embedding(
             full_output, full_lora_a_embeddings, self.lora_b_stacked, add_input=True
         )
 
@@ -158,7 +157,7 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
         source_layer: nn.Module,
         lora_config: LoRAConfig,
         packed_modules_list: list,
-        model_config: Optional[PretrainedConfig],
+        model_config: PretrainedConfig | None,
     ) -> bool:
         return type(source_layer) is VocabParallelEmbedding
 
diff --git a/vllm/lora/lora_weights.py b/vllm/lora/lora_weights.py
index b043a46f9e2a5..4a8b35aeb5b84 100644
--- a/vllm/lora/lora_weights.py
+++ b/vllm/lora/lora_weights.py
@@ -21,8 +21,8 @@ class LoRALayerWeights:
         lora_alpha: int,
         lora_a: torch.Tensor,
         lora_b: torch.Tensor,
-        embeddings_tensor: Optional[torch.Tensor] = None,
-        scaling: Optional[float] = None,
+        embeddings_tensor: torch.Tensor | None = None,
+        scaling: float | None = None,
     ) -> None:
         self.module_name = module_name
         self.rank = rank
@@ -67,7 +67,7 @@ class LoRALayerWeights:
         cls,
         module_name: str,
         peft_helper: PEFTHelper,
-        embeddings_tensor: Optional[torch.Tensor] = None,
+        embeddings_tensor: torch.Tensor | None = None,
     ) -> "LoRALayerWeights":
         # lora_a and lora_b are set to None for config-based construction
         return cls(
@@ -89,7 +89,7 @@ class LoRALayerWeights:
         rank: int,
         dtype: torch.dtype,
         device: torch.types.Device,
-        embeddings_tensor_dim: Optional[int] = None,
+        embeddings_tensor_dim: int | None = None,
     ) -> "LoRALayerWeights":
         pin_memory = str(device) == "cpu" and is_pin_memory_available()
         lora_a = torch.zeros(
@@ -127,10 +127,10 @@ class PackedLoRALayerWeights(LoRALayerWeights):
         self,
         module_name: str,
         rank: int,
-        lora_alphas: list[Optional[int]],
-        lora_a: list[Optional[torch.Tensor]],
-        lora_b: list[Optional[torch.Tensor]],
-        scaling: Optional[list[float]] = None,
+        lora_alphas: list[int | None],
+        lora_a: list[torch.Tensor | None],
+        lora_b: list[torch.Tensor | None],
+        scaling: list[float] | None = None,
     ) -> None:
         super().__init__(
             module_name=module_name,
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index cf9089eff1757..4840af7c7451b 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -3,7 +3,8 @@
 
 import math
 import os
-from typing import Callable, Optional, TypeVar, Union
+from collections.abc import Callable
+from typing import TypeVar
 
 import regex as re
 import safetensors.torch
@@ -44,7 +45,7 @@ class AdapterLRUCache(LRUCache[int, T]):
         super().__init__(capacity)
         self.deactivate_fn = deactivate_fn
 
-    def _on_remove(self, key: int, value: Optional[T]):
+    def _on_remove(self, key: int, value: T | None):
         logger.debug("Removing adapter int id: %d", key)
         self.deactivate_fn(key)
         return super()._on_remove(key, value)
@@ -113,7 +114,7 @@ class LoRAModel:
             else 0
         )
 
-    def get_lora(self, module_name: str) -> Optional[LoRALayerWeights]:
+    def get_lora(self, module_name: str) -> LoRALayerWeights | None:
         """Get LoRA for a given module by name"""
         return self.loras.get(module_name, None)
 
@@ -128,12 +129,12 @@ class LoRAModel:
         tensors: dict[str, torch.Tensor],
         peft_helper: PEFTHelper,
         device: str = "cuda",
-        dtype: Optional[torch.dtype] = None,
-        embeddings: Optional[dict[str, torch.Tensor]] = None,
-        target_embedding_padding: Optional[int] = None,
-        embedding_modules: Optional[dict[str, str]] = None,
-        embedding_padding_modules: Optional[list[str]] = None,
-        weights_mapper: Optional[WeightsMapper] = None,
+        dtype: torch.dtype | None = None,
+        embeddings: dict[str, torch.Tensor] | None = None,
+        target_embedding_padding: int | None = None,
+        embedding_modules: dict[str, str] | None = None,
+        embedding_padding_modules: list[str] | None = None,
+        weights_mapper: WeightsMapper | None = None,
     ) -> "LoRAModel":
         """Create a LoRAModel from a dictionary of tensors."""
         pin_memory = str(device) == "cpu" and is_pin_memory_available()
@@ -191,14 +192,14 @@ class LoRAModel:
         expected_lora_modules: list[str],
         peft_helper: PEFTHelper,
         *,
-        lora_model_id: Optional[int] = None,
+        lora_model_id: int | None = None,
         device: str = "cuda",
-        dtype: Optional[torch.dtype] = None,
-        target_embedding_padding: Optional[int] = None,
-        embedding_modules: Optional[dict[str, str]] = None,
-        embedding_padding_modules: Optional[list[str]] = None,
-        weights_mapper: Optional[WeightsMapper] = None,
-        tensorizer_config_dict: Optional[dict] = None,
+        dtype: torch.dtype | None = None,
+        target_embedding_padding: int | None = None,
+        embedding_modules: dict[str, str] | None = None,
+        embedding_padding_modules: list[str] | None = None,
+        weights_mapper: WeightsMapper | None = None,
+        tensorizer_config_dict: dict | None = None,
     ) -> "LoRAModel":
         """Create a LoRAModel from a local checkpoint.
 
@@ -223,7 +224,7 @@ class LoRAModel:
         )
         new_embeddings_bin_file_path = os.path.join(lora_dir, "new_embeddings.bin")
         tensors: dict[str, torch.Tensor] = {}
-        unexpected_modules: list[Union[list[str], str]] = []
+        unexpected_modules: list[list[str] | str] = []
 
         def check_unexpected_modules(modules: dict):
             for lora_module in modules.keys():  # noqa
@@ -357,7 +358,7 @@ class LoRAModelManager:
         self.max_num_seqs = max_num_seqs
         assert self.capacity >= self.lora_slots
         self.max_num_batched_tokens = math.ceil(max_num_batched_tokens / 8) * 8
-        self.lora_index_to_id: list[Optional[int]] = [None] * self.lora_slots
+        self.lora_index_to_id: list[int | None] = [None] * self.lora_slots
         self.vocab_size = vocab_size
         self.punica_wrapper = get_punica_wrapper(
             max_num_batched_tokens,
@@ -383,7 +384,7 @@ class LoRAModelManager:
         self.packed_modules: dict[str, list[str]] = {}
         self.modules: dict[str, BaseLayerWithLoRA] = {}
         # Dict instead of a set for compatibility with LRUCache.
-        self._last_mapping: Optional[LoRAMapping] = None
+        self._last_mapping: LoRAMapping | None = None
         self._create_lora_modules()
         self.model.lora_manager = self
 
@@ -555,7 +556,7 @@ class LoRAModelManager:
         self,
         lora_id: int,
         rank: int,
-        embedding_modules: Optional[dict[str, str]] = None,
+        embedding_modules: dict[str, str] | None = None,
     ) -> LoRAModel:
         """Create zero-initialized LoRAModel for warmup."""
         model = LoRAModel(lora_id, rank, {})
@@ -607,7 +608,7 @@ class LoRAModelManager:
             else:
                 parts = module_name.split(".")
                 replacements = self.packed_modules_mapping[parts[-1]]
-                subloras: list[Optional[LoRALayerWeights]] = []
+                subloras: list[LoRALayerWeights | None] = []
                 for i, r in enumerate(replacements):
                     lora = LoRALayerWeights.create_dummy_lora_weights(
                         module_name + "." + r,
@@ -658,7 +659,7 @@ class LoRAModelManager:
 
     def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None:
         for module_name, new_module_names in self.packed_modules.items():
-            replacement_loras: list[Optional[LoRALayerWeights]] = []
+            replacement_loras: list[LoRALayerWeights | None] = []
             replaced_module: set[str] = set()
             has_replacement = False
             for r in new_module_names:
@@ -687,7 +688,7 @@ class LoRAModelManager:
 
     def _get_lora_layer_weights(
         self, lora_model: LoRAModel, module_name: str
-    ) -> Optional[LoRALayerWeights]:
+    ) -> LoRALayerWeights | None:
         org_module_name = module_name
         if self.is_pooling_model and not lora_model.check_lora_name(module_name):
             # If it's a pool model, and the layer name is not found,
@@ -732,7 +733,7 @@ class LoRAModelManager:
     def list_adapters(self) -> dict[int, LoRAModel]:
         return dict(self._registered_adapters)
 
-    def get_adapter(self, adapter_id: int) -> Optional[LoRAModel]:
+    def get_adapter(self, adapter_id: int) -> LoRAModel | None:
         return self._registered_adapters.get(adapter_id)
 
 
diff --git a/vllm/lora/ops/triton_ops/lora_kernel_metadata.py b/vllm/lora/ops/triton_ops/lora_kernel_metadata.py
index df343305d710d..c3bef7680dd0d 100644
--- a/vllm/lora/ops/triton_ops/lora_kernel_metadata.py
+++ b/vllm/lora/ops/triton_ops/lora_kernel_metadata.py
@@ -5,7 +5,6 @@ LoRA kernels metadata preparation utilities.
 """
 
 from dataclasses import dataclass
-from typing import Union
 
 import torch
 
@@ -31,7 +30,7 @@ class LoRAKernelMeta:
 
     @staticmethod
     def make(
-        max_loras: int, max_num_tokens: int, device: Union[torch.device, str]
+        max_loras: int, max_num_tokens: int, device: torch.device | str
     ) -> "LoRAKernelMeta":
         token_lora_mapping = torch.empty(
             max_num_tokens, dtype=torch.int32, device=device
diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py
index 8f21a2570224e..975c3d8fc0a7f 100644
--- a/vllm/lora/peft_helper.py
+++ b/vllm/lora/peft_helper.py
@@ -7,7 +7,7 @@ import json
 import math
 import os
 from dataclasses import MISSING, dataclass, field, fields
-from typing import Literal, Optional, Union
+from typing import Literal
 
 from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
@@ -27,17 +27,17 @@ class PEFTHelper:
     # Required fields
     r: int
     lora_alpha: int
-    target_modules: Union[list[str], str]
+    target_modules: list[str] | str
 
     bias: Literal["none"] = field(default="none")
-    modules_to_save: Optional[list[str]] = field(default=None)
+    modules_to_save: list[str] | None = field(default=None)
     # True to use Rank-Stabilized LoRA (rsLoRA, see: https://arxiv.org/abs/2312.03732)
     use_rslora: bool = field(default=False)
     # True to use Weight-Decomposed Low-Rank Adaptation (DoRA, see: https://arxiv.org/abs/2402.09353)
     use_dora: bool = field(default=False)
     # Extra vllm field, start with 'vllm_' to avoid conflict
     vllm_lora_scaling_factor: float = field(default=1.0)
-    vllm_max_position_embeddings: Optional[int] = field(default=False)
+    vllm_max_position_embeddings: int | None = field(default=False)
 
     def _validate_features(self) -> list[str]:
         """
@@ -81,8 +81,8 @@ class PEFTHelper:
     def from_local_dir(
         cls,
         lora_path: str,
-        max_position_embeddings: Optional[int],
-        tensorizer_config_dict: Optional[dict] = None,
+        max_position_embeddings: int | None,
+        tensorizer_config_dict: dict | None = None,
     ) -> "PEFTHelper":
         lora_config_path = os.path.join(lora_path, "adapter_config.json")
 
diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py
index b803a482b1bca..3f3f33baaa793 100644
--- a/vllm/lora/punica_wrapper/punica_base.py
+++ b/vllm/lora/punica_wrapper/punica_base.py
@@ -8,7 +8,7 @@ https://arxiv.org/abs/2310.18547
 """
 
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING
 
 import torch
 
@@ -28,7 +28,7 @@ class PunicaWrapperABC(ABC):
     def update_metadata(
         self,
         mapping: "LoRAMapping",
-        lora_index_to_id: list[Optional[int]],
+        lora_index_to_id: list[int | None],
         max_loras: int,
         vocab_size: int,
         extra_vocab_size: int,
@@ -42,12 +42,12 @@ class PunicaWrapperABC(ABC):
     @abstractmethod
     def add_shrink(
         self,
-        y: Union[tuple[torch.Tensor, ...], torch.Tensor],
+        y: tuple[torch.Tensor, ...] | torch.Tensor,
         x: torch.Tensor,
         lora_a_stacked: tuple[torch.Tensor, ...],
         scale: float,
         **kwargs,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """
         Performs GEMM  for multiple slices of lora_a.
         """
@@ -58,13 +58,13 @@ class PunicaWrapperABC(ABC):
     def add_expand(
         self,
         y: torch.Tensor,
-        x: Union[tuple[torch.Tensor, ...], torch.Tensor],
+        x: tuple[torch.Tensor, ...] | torch.Tensor,
         lora_b_stacked: tuple[torch.Tensor, ...],
         output_slices: tuple[int, ...],
         offset_start: int = 0,
         add_inputs=True,
         **kwargs,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """
         Performs GEMM for multiple slices of lora_b.
         """
@@ -78,7 +78,7 @@ class PunicaWrapperABC(ABC):
         lora_b_stacked: torch.Tensor,
         add_inputs: bool = True,
         **kwargs,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """
         Applies lora  specifically for VocabParallelEmbeddingWithLoRA,
         and this layer only requires the expand operation.
@@ -95,9 +95,9 @@ class PunicaWrapperABC(ABC):
         scale: float,
         output_slices: tuple[int, ...],
         *,
-        buffer: Optional[tuple[torch.Tensor, ...]] = None,
+        buffer: tuple[torch.Tensor, ...] | None = None,
         **kwargs,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """
         Applicable to linear-related lora.
         """
@@ -113,9 +113,9 @@ class PunicaWrapperABC(ABC):
         lora_b_stacked: torch.Tensor,
         scale,
         *,
-        buffer: Optional[torch.Tensor] = None,
+        buffer: torch.Tensor | None = None,
         **kwargs,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """
         Applies lora  specifically for LogitsProcessorWithLoRA.
         """
@@ -133,7 +133,7 @@ class PunicaWrapperBase(PunicaWrapperABC):
         self,
         max_num_batched_tokens: int,
         max_batches: int,
-        device: Union[torch.device, str],
+        device: torch.device | str,
         **kwargs,
     ):
         self._token_lora_indices = torch.empty(
@@ -152,7 +152,7 @@ class PunicaWrapperBase(PunicaWrapperABC):
         # 4 is the number of indices tensors.
         # base_indices, sampler_indices, sampler_indices_padded,
         # embeddings_indices
-        self.indices_len: list[Optional[int]] = [None] * 4
+        self.indices_len: list[int | None] = [None] * 4
         # these attributes are the information required for sgmv kernel
         self._seq_start_locs = torch.empty(max_batches, dtype=torch.long, device=device)
         self._seq_lengths = torch.empty(max_batches, dtype=torch.long, device=device)
@@ -169,7 +169,7 @@ class PunicaWrapperBase(PunicaWrapperABC):
     def _update_base_metadata(
         self,
         mapping: "LoRAMapping",
-        lora_index_to_id: list[Optional[int]],
+        lora_index_to_id: list[int | None],
         max_loras: int,
         vocab_size: int,
         extra_vocab_size: int,
@@ -282,7 +282,7 @@ class PunicaWrapperBase(PunicaWrapperABC):
     def update_metadata(
         self,
         mapping: "LoRAMapping",
-        lora_index_to_id: list[Optional[int]],
+        lora_index_to_id: list[int | None],
         max_loras: int,
         vocab_size: int,
         extra_vocab_size: int,
@@ -302,12 +302,12 @@ class PunicaWrapperBase(PunicaWrapperABC):
     @abstractmethod
     def add_shrink(
         self,
-        y: Union[tuple[torch.Tensor, ...], torch.Tensor],
+        y: tuple[torch.Tensor, ...] | torch.Tensor,
         x: torch.Tensor,
         lora_a_stacked: tuple[torch.Tensor, ...],
         scale: float,
         **kwargs,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """
         Performs GEMM  for multiple slices of lora_a.
 
@@ -329,13 +329,13 @@ class PunicaWrapperBase(PunicaWrapperABC):
     def add_expand(
         self,
         y: torch.Tensor,
-        x: Union[tuple[torch.Tensor, ...], torch.Tensor],
+        x: tuple[torch.Tensor, ...] | torch.Tensor,
         lora_b_stacked: tuple[torch.Tensor, ...],
         output_slices: tuple[int, ...],
         offset_start: int = 0,
         add_inputs=True,
         **kwargs,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """
         Performs GEMM for multiple slices of lora_b.
 
@@ -366,7 +366,7 @@ class PunicaWrapperBase(PunicaWrapperABC):
         lora_b_stacked: torch.Tensor,
         add_inputs: bool = True,
         **kwargs,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """
         Applies lora  specifically for VocabParallelEmbeddingWithLoRA.
         and this layer only requires the expand operation.
@@ -392,9 +392,9 @@ class PunicaWrapperBase(PunicaWrapperABC):
         scale: float,
         output_slices: tuple[int, ...],
         *,
-        buffer: Optional[tuple[torch.Tensor, ...]] = None,
+        buffer: tuple[torch.Tensor, ...] | None = None,
         **kwargs,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """
         Applicable to linear-related lora.
 
@@ -428,9 +428,9 @@ class PunicaWrapperBase(PunicaWrapperABC):
         lora_b_stacked: torch.Tensor,
         scale,
         *,
-        buffer: Optional[torch.Tensor] = None,
+        buffer: torch.Tensor | None = None,
         **kwargs,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """
         Applies lora  specifically for LogitsProcessorWithLoRA.
 
diff --git a/vllm/lora/punica_wrapper/punica_cpu.py b/vllm/lora/punica_wrapper/punica_cpu.py
index 93e64eb6ba843..1a700d9bf1f06 100644
--- a/vllm/lora/punica_wrapper/punica_cpu.py
+++ b/vllm/lora/punica_wrapper/punica_cpu.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable, Optional, Union
+from collections.abc import Callable
 
 import torch
 
@@ -30,7 +30,7 @@ class PunicaWrapperCPU(PunicaWrapperBase):
         self,
         max_num_batched_tokens: int,
         max_batches: int,
-        device: Union[torch.device, str],
+        device: torch.device | str,
         **kwargs,
     ):
         PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches, device)
@@ -165,7 +165,7 @@ class PunicaWrapperCPU(PunicaWrapperBase):
 
     def add_shrink(
         self,
-        y: Union[tuple[torch.Tensor, ...], torch.Tensor],
+        y: tuple[torch.Tensor, ...] | torch.Tensor,
         x: torch.Tensor,
         lora_a_stacked: tuple[torch.Tensor, ...],
         scale: float,
@@ -197,7 +197,7 @@ class PunicaWrapperCPU(PunicaWrapperBase):
     def add_expand(
         self,
         y: torch.Tensor,
-        x: Union[tuple[torch.Tensor, ...], torch.Tensor],
+        x: tuple[torch.Tensor, ...] | torch.Tensor,
         lora_b_stacked: tuple[torch.Tensor, ...],
         output_slices: tuple[int, ...],
         offset_start: int = 0,
@@ -271,7 +271,7 @@ class PunicaWrapperCPU(PunicaWrapperBase):
         scale: float,
         output_slices: tuple[int, ...],
         *,
-        buffer: Optional[tuple[torch.Tensor, ...]] = None,
+        buffer: tuple[torch.Tensor, ...] | None = None,
         **kwargs,
     ) -> None:
         """
@@ -319,7 +319,7 @@ class PunicaWrapperCPU(PunicaWrapperBase):
         lora_b_stacked: torch.Tensor,
         scale,
         *,
-        buffer: Optional[torch.Tensor] = None,
+        buffer: torch.Tensor | None = None,
         **kwargs,
     ) -> None:
         """
diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py
index 8173fe99ea13d..44a5443c30654 100644
--- a/vllm/lora/punica_wrapper/punica_gpu.py
+++ b/vllm/lora/punica_wrapper/punica_gpu.py
@@ -7,7 +7,7 @@ Punica: Multi-Tenant LoRA Serving.
 https://arxiv.org/abs/2310.18547
 """
 
-from typing import Optional, Union, final
+from typing import final
 
 import torch
 
@@ -32,7 +32,7 @@ class PunicaWrapperGPU(PunicaWrapperBase):
         self,
         max_num_batched_tokens: int,
         max_batches: int,
-        device: Union[torch.device, str],
+        device: torch.device | str,
         **kwargs,
     ):
         PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches, device)
@@ -50,7 +50,7 @@ class PunicaWrapperGPU(PunicaWrapperBase):
     def update_metadata(
         self,
         mapping: LoRAMapping,
-        lora_index_to_id: list[Optional[int]],
+        lora_index_to_id: list[int | None],
         max_loras: int,
         vocab_size: int,
         extra_vocab_size: int,
@@ -179,7 +179,7 @@ class PunicaWrapperGPU(PunicaWrapperBase):
         scale: float,
         output_slices: tuple[int, ...],
         *,
-        buffer: Optional[torch.Tensor] = None,
+        buffer: torch.Tensor | None = None,
         **kwargs,
     ) -> None:
         """
@@ -238,7 +238,7 @@ class PunicaWrapperGPU(PunicaWrapperBase):
         lora_b_stacked: torch.Tensor,
         scale,
         *,
-        buffer: Optional[torch.Tensor] = None,
+        buffer: torch.Tensor | None = None,
         **kwargs,
     ) -> None:
         """
diff --git a/vllm/lora/punica_wrapper/punica_tpu.py b/vllm/lora/punica_wrapper/punica_tpu.py
index dff30d5d2a2d1..090878dcd2546 100644
--- a/vllm/lora/punica_wrapper/punica_tpu.py
+++ b/vllm/lora/punica_wrapper/punica_tpu.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING
 
 import torch
 import torch.nn.functional as F
@@ -29,7 +29,7 @@ class PunicaWrapperTPU(PunicaWrapperBase):
         self,
         max_num_batched_tokens: int,
         max_batches: int,
-        device: Union[torch.device, str],
+        device: torch.device | str,
         **kwargs,
     ):
         PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches, device)
@@ -105,12 +105,12 @@ class PunicaWrapperTPU(PunicaWrapperBase):
 
     def add_shrink(
         self,
-        y: Union[tuple[torch.Tensor, ...], torch.Tensor],
+        y: tuple[torch.Tensor, ...] | torch.Tensor,
         x: torch.Tensor,
         lora_a_stacked: tuple[torch.Tensor, ...],
         scale: float,
         **kwargs,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """
         Performs GEMM for multiple slices of lora_a.
 
@@ -137,7 +137,7 @@ class PunicaWrapperTPU(PunicaWrapperBase):
     def add_expand(
         self,
         y: torch.Tensor,
-        x: Union[tuple[torch.Tensor, ...], torch.Tensor],
+        x: tuple[torch.Tensor, ...] | torch.Tensor,
         lora_b_stacked: tuple[torch.Tensor, ...],
         output_slices: tuple[int, ...],
         offset_start: int = 0,
@@ -209,7 +209,7 @@ class PunicaWrapperTPU(PunicaWrapperBase):
         scale: float,
         output_slices: tuple[int, ...],
         *,
-        buffer: Optional[tuple[torch.Tensor, ...]] = None,
+        buffer: tuple[torch.Tensor, ...] | None = None,
         **kwargs,
     ) -> torch.Tensor:
         """
@@ -257,7 +257,7 @@ class PunicaWrapperTPU(PunicaWrapperBase):
         lora_b_stacked: torch.Tensor,
         scale,
         *,
-        buffer: Optional[torch.Tensor] = None,
+        buffer: torch.Tensor | None = None,
         **kwargs,
     ) -> torch.Tensor:
         """
@@ -289,7 +289,7 @@ class PunicaWrapperTPU(PunicaWrapperBase):
     def _update_base_metadata(
         self,
         mapping: "LoRAMapping",
-        lora_index_to_id: list[Optional[int]],
+        lora_index_to_id: list[int | None],
         max_loras: int,
         vocab_size: int,
         extra_vocab_size: int,
diff --git a/vllm/lora/punica_wrapper/punica_xpu.py b/vllm/lora/punica_wrapper/punica_xpu.py
index e3d03ac8dc2c2..b95087d0ff834 100644
--- a/vllm/lora/punica_wrapper/punica_xpu.py
+++ b/vllm/lora/punica_wrapper/punica_xpu.py
@@ -7,7 +7,7 @@ Punica: Multi-Tenant LoRA Serving.
 https://arxiv.org/abs/2310.18547
 """
 
-from typing import Optional, Union, final
+from typing import final
 
 import torch
 
@@ -29,7 +29,7 @@ class PunicaWrapperXPU(PunicaWrapperBase):
         self,
         max_num_batched_tokens: int,
         max_batches: int,
-        device: Union[torch.device, str],
+        device: torch.device | str,
         **kwargs,
     ):
         PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches, device)
@@ -40,7 +40,7 @@ class PunicaWrapperXPU(PunicaWrapperBase):
     def update_metadata(
         self,
         mapping: LoRAMapping,
-        lora_index_to_id: list[Optional[int]],
+        lora_index_to_id: list[int | None],
         max_loras: int,
         vocab_size: int,
         extra_vocab_size: int,
@@ -180,7 +180,7 @@ class PunicaWrapperXPU(PunicaWrapperBase):
         scale: float,
         output_slices: tuple[int, ...],
         *,
-        buffer: Optional[torch.Tensor] = None,
+        buffer: torch.Tensor | None = None,
         **kwargs,
     ) -> None:
         """
@@ -247,7 +247,7 @@ class PunicaWrapperXPU(PunicaWrapperBase):
         lora_b_stacked: torch.Tensor,
         scale,
         *,
-        buffer: Optional[torch.Tensor] = None,
+        buffer: torch.Tensor | None = None,
         **kwargs,
     ) -> None:
         """
diff --git a/vllm/lora/punica_wrapper/utils.py b/vllm/lora/punica_wrapper/utils.py
index 90d1614e674db..584745f86b1a8 100644
--- a/vllm/lora/punica_wrapper/utils.py
+++ b/vllm/lora/punica_wrapper/utils.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING
 
 import torch
 
@@ -51,7 +51,7 @@ def compute_meta(
 # TODO see if this can be vectorized
 def convert_mapping(
     mapping: "LoRAMapping",
-    lora_index_to_id: list[Optional[int]],
+    lora_index_to_id: list[int | None],
     max_loras: int,
     vocab_size: int,
     extra_vocab_size: int,
@@ -104,7 +104,7 @@ def convert_mapping(
         embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0
         lora_indices[i] = lora_idx
 
-    indices_list: list[Union[list[int], torch.Tensor]] = [
+    indices_list: list[list[int] | torch.Tensor] = [
         index_mapping_indices,
         lora_indices,
         embedding_indices,
diff --git a/vllm/lora/request.py b/vllm/lora/request.py
index 650e060a5804d..c97e435e32165 100644
--- a/vllm/lora/request.py
+++ b/vllm/lora/request.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import warnings
-from typing import Optional
 
 import msgspec
 
@@ -27,10 +26,10 @@ class LoRARequest(
     lora_name: str
     lora_int_id: int
     lora_path: str = ""
-    lora_local_path: Optional[str] = msgspec.field(default=None)
-    long_lora_max_len: Optional[int] = None
-    base_model_name: Optional[str] = msgspec.field(default=None)
-    tensorizer_config_dict: Optional[dict] = None
+    lora_local_path: str | None = msgspec.field(default=None)
+    long_lora_max_len: int | None = None
+    base_model_name: str | None = msgspec.field(default=None)
+    tensorizer_config_dict: dict | None = None
 
     def __post_init__(self):
         if self.lora_int_id < 1:
diff --git a/vllm/lora/resolver.py b/vllm/lora/resolver.py
index d366b94521cd8..bcfe26467cfb4 100644
--- a/vllm/lora/resolver.py
+++ b/vllm/lora/resolver.py
@@ -4,7 +4,6 @@
 from abc import ABC, abstractmethod
 from collections.abc import Set
 from dataclasses import dataclass, field
-from typing import Optional
 
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
@@ -24,7 +23,7 @@ class LoRAResolver(ABC):
     @abstractmethod
     async def resolve_lora(
         self, base_model_name: str, lora_name: str
-    ) -> Optional[LoRARequest]:
+    ) -> LoRARequest | None:
         """Abstract method to resolve and fetch a LoRA model adapter.
 
         Implements logic to locate and download LoRA adapter based on the name.
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index 595c774e03be3..e61c5ae701233 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING, Optional
 
 import huggingface_hub
 import regex as re
@@ -66,7 +66,7 @@ def from_layer(
     max_loras: int,
     lora_config: LoRAConfig,
     packed_modules_list: list,
-    model_config: Optional[PretrainedConfig] = None,
+    model_config: PretrainedConfig | None = None,
 ) -> nn.Module:
     for lora_cls in _all_lora_classes:
         # specifying kwargs so they can be easily accessed in decorator
@@ -87,7 +87,7 @@ def from_layer_logits_processor(
     lm_head: "ParallelLMHead",
     max_loras: int,
     lora_config: LoRAConfig,
-    model_config: Optional[PretrainedConfig] = None,
+    model_config: PretrainedConfig | None = None,
 ) -> LogitsProcessorWithLoRA:
     ret = LogitsProcessorWithLoRA(
         layer,
@@ -155,7 +155,7 @@ def parse_fine_tuned_lora_name(
 
 
 def is_regex_target_modules(
-    load_modules: Union[str, list[str]], expected_lora_modules: list[str]
+    load_modules: str | list[str], expected_lora_modules: list[str]
 ) -> bool:
     """
     PEFT supports passing `target_modules` in the form of regular expressions,
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index 3ca819fb732cf..635685079b2d7 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import contextmanager
-from typing import Any, Literal, Optional, Union
+from typing import Any, Literal
 
 import torch
 
@@ -40,7 +40,7 @@ class WorkerLoRAManager:
         self._lora_model_cls = lora_model_cls
         self.embedding_modules = embedding_modules
         self.embedding_padding_modules = embedding_padding_modules
-        self._cached_dummy_lora: Union[None, Literal[False], LoRAModel] = False
+        self._cached_dummy_lora: None | Literal[False] | LoRAModel = False
         self.max_num_seqs = vllm_config.scheduler_config.max_num_seqs
         self.max_num_batched_tokens = (
             vllm_config.scheduler_config.max_num_batched_tokens
@@ -166,7 +166,7 @@ class WorkerLoRAManager:
     def pin_adapter(self, adapter_id: int) -> bool:
         return self._adapter_manager.pin_adapter(adapter_id)
 
-    def set_active_adapters(self, requests: set[Any], mapping: Optional[Any]) -> None:
+    def set_active_adapters(self, requests: set[Any], mapping: Any | None) -> None:
         self._apply_adapters(requests)
         if mapping is not None:
             self._adapter_manager.set_adapter_mapping(mapping)
diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
index ad5a09ca970d6..7f75066f2c36f 100644
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch.nn as nn
 
@@ -171,7 +170,7 @@ class CustomOp(nn.Module):
     # or
     # - @CustomOP.register_oot(name="UnquantizedFusedMoEMethod")
     @classmethod
-    def register_oot(cls, _decorated_op_cls=None, name: Optional[str] = None):
+    def register_oot(cls, _decorated_op_cls=None, name: str | None = None):
         def decorator(op_cls):
             reg_name = name if name is not None else cls.__name__
             assert reg_name not in cls.op_registry_oot, f"Duplicate op name: {reg_name}"
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index 96745b99f7a7e..50548d2e1afa8 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -3,7 +3,6 @@
 """Custom activation functions."""
 
 import math
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -486,7 +485,7 @@ class ScaledActivation(nn.Module):
         act_module: nn.Module,
         intermediate_size: int,
         input_is_parallel: bool = True,
-        params_dtype: Optional[torch.dtype] = None,
+        params_dtype: torch.dtype | None = None,
     ):
         super().__init__()
         self.act = act_module
diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py
index 9fd85d1e9e194..9a82d098167b5 100644
--- a/vllm/model_executor/layers/batch_invariant.py
+++ b/vllm/model_executor/layers/batch_invariant.py
@@ -4,7 +4,7 @@ import contextlib
 import os
 from collections import namedtuple
 from collections.abc import Callable
-from typing import Any, Union
+from typing import Any
 
 import torch
 
@@ -138,7 +138,7 @@ def matmul_kernel_persistent(
 
 
 def matmul_persistent(
-    a: torch.Tensor, b: torch.Tensor, bias: Union[torch.Tensor, None] = None
+    a: torch.Tensor, b: torch.Tensor, bias: torch.Tensor | None = None
 ):
     # Check constraints.
     assert a.shape[1] == b.shape[0], "Incompatible dimensions"
@@ -375,7 +375,7 @@ def mean_dim(
     input: torch.Tensor,
     dim: int,
     keepdim: bool = False,
-    dtype: Union[torch.dtype, None] = None,
+    dtype: torch.dtype | None = None,
 ) -> torch.Tensor:
     """
     Triton implementation of torch.mean with single dimension reduction.
@@ -475,9 +475,7 @@ def _log_softmax_batch_invariant(input, dim, _half_to_float):
     return log_softmax(input, dim=dim)
 
 
-def mean_batch_invariant(
-    input, dim, keepdim=False, dtype: Union[torch.dtype, None] = None
-):
+def mean_batch_invariant(input, dim, keepdim=False, dtype: torch.dtype | None = None):
     assert dtype is None or dtype == torch.float32, f"unsupported dtype: {dtype}"
 
     result = input.to(torch.float32)
diff --git a/vllm/model_executor/layers/fla/ops/chunk.py b/vllm/model_executor/layers/fla/ops/chunk.py
index d65c87aba11cd..b046a6d3919e9 100644
--- a/vllm/model_executor/layers/fla/ops/chunk.py
+++ b/vllm/model_executor/layers/fla/ops/chunk.py
@@ -8,7 +8,6 @@
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 # ruff: noqa: E501
 import warnings
-from typing import Optional
 
 import torch
 from einops import rearrange
@@ -32,7 +31,7 @@ def chunk_gated_delta_rule_fwd(
     scale: float,
     initial_state: torch.Tensor,
     output_final_state: bool,
-    cu_seqlens: Optional[torch.LongTensor] = None,
+    cu_seqlens: torch.LongTensor | None = None,
 ):
     g = chunk_local_cumsum(g, chunk_size=64, cu_seqlens=cu_seqlens)
     # obtain WY representation. u is actually the new v.
@@ -86,7 +85,7 @@ class ChunkGatedDeltaRuleFunction(torch.autograd.Function):
         scale: float,
         initial_state: torch.Tensor,
         output_final_state: bool,
-        cu_seqlens: Optional[torch.LongTensor] = None,
+        cu_seqlens: torch.LongTensor | None = None,
         use_qk_l2norm_in_kernel: bool = False,
     ):
         if use_qk_l2norm_in_kernel:
@@ -119,7 +118,7 @@ def chunk_gated_delta_rule(
     scale: float = None,
     initial_state: torch.Tensor = None,
     output_final_state: bool = False,
-    cu_seqlens: Optional[torch.LongTensor] = None,
+    cu_seqlens: torch.LongTensor | None = None,
     head_first: bool = False,
     use_qk_l2norm_in_kernel: bool = False,
 ):
diff --git a/vllm/model_executor/layers/fla/ops/chunk_delta_h.py b/vllm/model_executor/layers/fla/ops/chunk_delta_h.py
index 817962d9c9465..1c14f84c2b895 100644
--- a/vllm/model_executor/layers/fla/ops/chunk_delta_h.py
+++ b/vllm/model_executor/layers/fla/ops/chunk_delta_h.py
@@ -7,7 +7,6 @@
 # the following copyright notice:
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 # ruff: noqa: E501
-from typing import Optional
 
 import torch
 
@@ -257,12 +256,12 @@ def chunk_gated_delta_rule_fwd_h(
     k: torch.Tensor,
     w: torch.Tensor,
     u: torch.Tensor,
-    g: Optional[torch.Tensor] = None,
-    initial_state: Optional[torch.Tensor] = None,
+    g: torch.Tensor | None = None,
+    initial_state: torch.Tensor | None = None,
     output_final_state: bool = False,
     chunk_size: int = 64,  # SY: remove this argument and force chunk size 64?
     save_new_value: bool = True,
-    cu_seqlens: Optional[torch.LongTensor] = None,
+    cu_seqlens: torch.LongTensor | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     B, T, Hg, K, V = *k.shape, u.shape[-1]
     H = u.shape[-2]
diff --git a/vllm/model_executor/layers/fla/ops/chunk_o.py b/vllm/model_executor/layers/fla/ops/chunk_o.py
index ae404a3615f61..4e8e04c1d48ca 100644
--- a/vllm/model_executor/layers/fla/ops/chunk_o.py
+++ b/vllm/model_executor/layers/fla/ops/chunk_o.py
@@ -9,7 +9,6 @@
 
 # ruff: noqa: E501
 
-from typing import Optional
 
 import torch
 
@@ -144,9 +143,9 @@ def chunk_fwd_o(
     k: torch.Tensor,
     v: torch.Tensor,
     h: torch.Tensor,
-    g: Optional[torch.Tensor] = None,  # cumsum of log decay
-    scale: Optional[float] = None,
-    cu_seqlens: Optional[torch.LongTensor] = None,
+    g: torch.Tensor | None = None,  # cumsum of log decay
+    scale: float | None = None,
+    cu_seqlens: torch.LongTensor | None = None,
     chunk_size: int = 64,
 ) -> torch.Tensor:
     B, T, Hg, K, V = *q.shape, v.shape[-1]
diff --git a/vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py b/vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py
index 0da3f243901fb..975e119af333e 100644
--- a/vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py
+++ b/vllm/model_executor/layers/fla/ops/chunk_scaled_dot_kkt.py
@@ -7,7 +7,6 @@
 # the following copyright notice:
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 # ruff: noqa: E501
-from typing import Optional
 
 import torch
 
@@ -104,8 +103,8 @@ def chunk_scaled_dot_kkt_fwd_kernel(
 def chunk_scaled_dot_kkt_fwd(
     k: torch.Tensor,
     beta: torch.Tensor,
-    g_cumsum: Optional[torch.Tensor] = None,
-    cu_seqlens: Optional[torch.LongTensor] = None,
+    g_cumsum: torch.Tensor | None = None,
+    cu_seqlens: torch.LongTensor | None = None,
     chunk_size: int = 64,
     output_dtype: torch.dtype = torch.float32,
 ) -> torch.Tensor:
diff --git a/vllm/model_executor/layers/fla/ops/cumsum.py b/vllm/model_executor/layers/fla/ops/cumsum.py
index cfa2b3b48e709..99b41794796d8 100644
--- a/vllm/model_executor/layers/fla/ops/cumsum.py
+++ b/vllm/model_executor/layers/fla/ops/cumsum.py
@@ -8,7 +8,6 @@
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 # ruff: noqa: E501
 import warnings
-from typing import Optional
 
 import torch
 
@@ -163,9 +162,9 @@ def chunk_local_cumsum_scalar(
     g: torch.Tensor,
     chunk_size: int,
     reverse: bool = False,
-    cu_seqlens: Optional[torch.Tensor] = None,
+    cu_seqlens: torch.Tensor | None = None,
     head_first: bool = False,
-    output_dtype: Optional[torch.dtype] = torch.float,
+    output_dtype: torch.dtype | None = torch.float,
 ) -> torch.Tensor:
     if head_first:
         B, H, T = g.shape
@@ -200,9 +199,9 @@ def chunk_local_cumsum_vector(
     g: torch.Tensor,
     chunk_size: int,
     reverse: bool = False,
-    cu_seqlens: Optional[torch.Tensor] = None,
+    cu_seqlens: torch.Tensor | None = None,
     head_first: bool = False,
-    output_dtype: Optional[torch.dtype] = torch.float,
+    output_dtype: torch.dtype | None = torch.float,
 ) -> torch.Tensor:
     if head_first:
         B, H, T, S = g.shape
@@ -248,9 +247,9 @@ def chunk_local_cumsum(
     g: torch.Tensor,
     chunk_size: int,
     reverse: bool = False,
-    cu_seqlens: Optional[torch.Tensor] = None,
+    cu_seqlens: torch.Tensor | None = None,
     head_first: bool = False,
-    output_dtype: Optional[torch.dtype] = torch.float,
+    output_dtype: torch.dtype | None = torch.float,
     **kwargs,
 ) -> torch.Tensor:
     if not head_first and g.shape[1] < g.shape[2]:
diff --git a/vllm/model_executor/layers/fla/ops/fused_recurrent.py b/vllm/model_executor/layers/fla/ops/fused_recurrent.py
index fa10bdb36caa3..f3de1bfa28219 100644
--- a/vllm/model_executor/layers/fla/ops/fused_recurrent.py
+++ b/vllm/model_executor/layers/fla/ops/fused_recurrent.py
@@ -7,7 +7,6 @@
 # the following copyright notice:
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 # ruff: noqa: E501
-from typing import Optional
 
 import torch
 
@@ -169,9 +168,9 @@ def fused_recurrent_gated_delta_rule_fwd(
     scale: float,
     initial_state: torch.Tensor,
     inplace_final_state: bool = True,
-    cu_seqlens: Optional[torch.LongTensor] = None,
-    ssm_state_indices: Optional[torch.Tensor] = None,
-    num_accepted_tokens: Optional[torch.Tensor] = None,
+    cu_seqlens: torch.LongTensor | None = None,
+    ssm_state_indices: torch.Tensor | None = None,
+    num_accepted_tokens: torch.Tensor | None = None,
     use_qk_l2norm_in_kernel: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     B, T, H, K, V = *k.shape, v.shape[-1]
@@ -248,9 +247,9 @@ class FusedRecurrentFunction(torch.autograd.Function):
         scale: float,
         initial_state: torch.Tensor,
         inplace_final_state: bool = True,
-        cu_seqlens: Optional[torch.LongTensor] = None,
-        ssm_state_indices: Optional[torch.Tensor] = None,
-        num_accepted_tokens: Optional[torch.Tensor] = None,
+        cu_seqlens: torch.LongTensor | None = None,
+        ssm_state_indices: torch.Tensor | None = None,
+        num_accepted_tokens: torch.Tensor | None = None,
         use_qk_l2norm_in_kernel: bool = False,
     ):
         o, final_state = fused_recurrent_gated_delta_rule_fwd(
@@ -280,9 +279,9 @@ def fused_recurrent_gated_delta_rule(
     scale: float = None,
     initial_state: torch.Tensor = None,
     inplace_final_state: bool = True,
-    cu_seqlens: Optional[torch.LongTensor] = None,
-    ssm_state_indices: Optional[torch.Tensor] = None,
-    num_accepted_tokens: Optional[torch.Tensor] = None,
+    cu_seqlens: torch.LongTensor | None = None,
+    ssm_state_indices: torch.Tensor | None = None,
+    num_accepted_tokens: torch.Tensor | None = None,
     use_qk_l2norm_in_kernel: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     r"""
diff --git a/vllm/model_executor/layers/fla/ops/l2norm.py b/vllm/model_executor/layers/fla/ops/l2norm.py
index 315dd904523b8..4d7dbb5100681 100644
--- a/vllm/model_executor/layers/fla/ops/l2norm.py
+++ b/vllm/model_executor/layers/fla/ops/l2norm.py
@@ -8,7 +8,6 @@
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 
 import os
-from typing import Optional
 
 import torch
 
@@ -90,7 +89,7 @@ def l2norm_fwd_kernel2(X, Y, eps, M, N: tl.constexpr, MBLOCK: tl.constexpr):
 
 
 def l2norm_fwd(
-    x: torch.Tensor, eps: float = 1e-6, output_dtype: Optional[torch.dtype] = None
+    x: torch.Tensor, eps: float = 1e-6, output_dtype: torch.dtype | None = None
 ):
     x_shape_og = x.shape
     x = x.view(-1, x.shape[-1])
diff --git a/vllm/model_executor/layers/fla/ops/layernorm_guard.py b/vllm/model_executor/layers/fla/ops/layernorm_guard.py
index 6d039efe58767..307d0859c24e5 100644
--- a/vllm/model_executor/layers/fla/ops/layernorm_guard.py
+++ b/vllm/model_executor/layers/fla/ops/layernorm_guard.py
@@ -14,7 +14,6 @@
 # The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.
 
 from functools import lru_cache
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -324,10 +323,10 @@ class LayerNormGated(nn.Module):
         self,
         hidden_size,
         eps: float = 1e-5,
-        group_size: Optional[int] = None,
+        group_size: int | None = None,
         norm_before_gate: bool = True,
-        device: Optional[torch.device] = None,
-        dtype: Optional[torch.dtype] = None,
+        device: torch.device | None = None,
+        dtype: torch.dtype | None = None,
     ):
         """If group_size is not None, we do GroupNorm with each group having group_size elements.
         group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group).
@@ -364,10 +363,10 @@ class RMSNormGated(nn.Module):
         self,
         hidden_size,
         eps: float = 1e-5,
-        group_size: Optional[int] = None,
+        group_size: int | None = None,
         norm_before_gate: bool = False,
-        device: Optional[torch.device] = None,
-        dtype: Optional[torch.dtype] = None,
+        device: torch.device | None = None,
+        dtype: torch.dtype | None = None,
     ):
         """If group_size is not None, we do GroupNorm with each group having group_size elements.
         group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group).
diff --git a/vllm/model_executor/layers/fla/ops/solve_tril.py b/vllm/model_executor/layers/fla/ops/solve_tril.py
index d30fea90aec38..010beba19dbe3 100644
--- a/vllm/model_executor/layers/fla/ops/solve_tril.py
+++ b/vllm/model_executor/layers/fla/ops/solve_tril.py
@@ -7,7 +7,6 @@
 # the following copyright notice:
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 # ruff: noqa: E501
-from typing import Optional
 
 import torch
 
@@ -407,7 +406,7 @@ def merge_16x16_to_64x64_inverse_kernel(
 @input_guard
 def solve_tril(
     A: torch.Tensor,
-    cu_seqlens: Optional[torch.Tensor] = None,
+    cu_seqlens: torch.Tensor | None = None,
     output_dtype: torch.dtype = torch.float,
 ) -> torch.Tensor:
     """
diff --git a/vllm/model_executor/layers/fla/ops/utils.py b/vllm/model_executor/layers/fla/ops/utils.py
index 07124f33f1e66..1ed82c6086bb2 100644
--- a/vllm/model_executor/layers/fla/ops/utils.py
+++ b/vllm/model_executor/layers/fla/ops/utils.py
@@ -11,8 +11,9 @@ import contextlib
 import functools
 import logging
 import os
+from collections.abc import Callable
 from enum import Enum
-from typing import Any, Callable, Literal, Optional
+from typing import Any, Literal
 
 import torch
 
@@ -43,7 +44,7 @@ def tensor_cache(fn: Callable[..., torch.Tensor]) -> Callable[..., torch.Tensor]
             A wrapped version of the input function with single-entry caching.
     """
 
-    cache_entries: tuple[Optional[tuple], Optional[dict], Any] = []
+    cache_entries: tuple[tuple | None, dict | None, Any] = []
     cache_size = 4
 
     @functools.wraps(fn)
diff --git a/vllm/model_executor/layers/fla/ops/wy_fast.py b/vllm/model_executor/layers/fla/ops/wy_fast.py
index b628a90e843f8..a66ec1d60d668 100644
--- a/vllm/model_executor/layers/fla/ops/wy_fast.py
+++ b/vllm/model_executor/layers/fla/ops/wy_fast.py
@@ -8,7 +8,6 @@
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
 
 # ruff: noqa: E501
-from typing import Optional
 
 import torch
 
@@ -123,7 +122,7 @@ def recompute_w_u_fwd(
     beta: torch.Tensor,
     g_cumsum: torch.Tensor,
     A: torch.Tensor,
-    cu_seqlens: Optional[torch.LongTensor],
+    cu_seqlens: torch.LongTensor | None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     B, T, Hg, K, V = *k.shape, v.shape[-1]
     H = v.shape[-2]
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index 799f782848944..247919dcc8440 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from contextlib import contextmanager
-from typing import Any, Optional
+from typing import Any
 
 from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
 from vllm.model_executor.layers.fused_moe.layer import (
@@ -19,7 +19,7 @@ from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.fused_moe.utils import activation_without_mul
 from vllm.triton_utils import HAS_TRITON
 
-_config: Optional[dict[str, Any]] = None
+_config: dict[str, Any] | None = None
 
 
 @contextmanager
@@ -31,7 +31,7 @@ def override_config(config):
     _config = old_config
 
 
-def get_config() -> Optional[dict[str, Any]]:
+def get_config() -> dict[str, Any] | None:
     return _config
 
 
diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
index 35d2dcb91d253..91ce7e30199d2 100644
--- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
@@ -259,7 +258,7 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         # FIXME (varun): We should be able to dispatch only from the leader
         # DP ranks in the case of TP > 1. At the moment, all the Ranks
@@ -282,12 +281,12 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
         assert expert_tokens_meta is not None
diff --git a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py
index 09c4de0f87159..1b1af351a449e 100644
--- a/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
@@ -110,7 +109,7 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_metadata: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_metadata: mk.ExpertTokensMetadata | None,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         # Note: the deep gemm workspaces are strictly larger than the triton
         # workspaces so we can be pessimistic here and allocate for DeepGemm
@@ -148,12 +147,12 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
         experts = (
diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py
index 5780c969d273a..377116124522c 100644
--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
@@ -34,8 +34,8 @@ def _get_config_dtype_str(
     use_fp8_w8a8: bool = False,
     use_int8_w8a16: bool = False,
     use_int4_w4a16: bool = False,
-    ocp_mx_scheme: Optional[str] = None,
-) -> Optional[str]:
+    ocp_mx_scheme: str | None = None,
+) -> str | None:
     """
     Return a string used to construct the filename that contains the
     tuning info for a particular quantization scheme.  See
@@ -60,16 +60,16 @@ def _get_config_dtype_str(
 
 
 def _quant_flags_to_group_shape(
-    quant_dtype: Union[torch.dtype, str, None],
+    quant_dtype: torch.dtype | str | None,
     per_act_token_quant: bool,
     per_out_ch_quant: bool,
-    block_shape: Optional[list[int]],
-) -> tuple[Optional[GroupShape], Optional[GroupShape]]:
+    block_shape: list[int] | None,
+) -> tuple[GroupShape | None, GroupShape | None]:
     """
     Convert MoE quantization flags into more generic GroupShapes.
     """
-    a_shape: Optional[GroupShape]
-    w_shape: Optional[GroupShape]
+    a_shape: GroupShape | None
+    w_shape: GroupShape | None
     if block_shape is not None:
         assert not per_act_token_quant
         assert not per_out_ch_quant
@@ -100,7 +100,7 @@ class FusedMoEQuantDesc:
     # The quantized type of this parameters.  None means unquantized or
     # already quantized.
     # TODO (bnell): use scalar_type instead of Union.
-    dtype: Union[torch.dtype, str, None] = None
+    dtype: torch.dtype | str | None = None
 
     # A field that describes the quantization group shape, from quant_utils.py.
     #  * (-1, -1)   for per-tensor quantization
@@ -109,7 +109,7 @@ class FusedMoEQuantDesc:
     #  * (128, 128) for 128x128 deepseek style block quantization
     #  * (1, 128)   for deepseek style activation quantization
     #               (i.e. per-token-per-group)
-    shape: Optional[GroupShape] = None
+    shape: GroupShape | None = None
 
     # Quantization scales.
     # TODO(bnell): maybe put PrecisionConfigs in subclass of QuantDesc?
@@ -117,13 +117,13 @@ class FusedMoEQuantDesc:
 
     # Quantization alphas or gscales, used for nvfp4 types.
     # TODO(bnell): put some of these in subclasses
-    alpha_or_gscale: Optional[torch.Tensor] = None
+    alpha_or_gscale: torch.Tensor | None = None
 
     # Zero points for int4/int8 types
-    zp: Optional[torch.Tensor] = None
+    zp: torch.Tensor | None = None
 
     # Biases for GPT triton MoE
-    bias: Optional[torch.Tensor] = None
+    bias: torch.Tensor | None = None
 
 
 # TODO(bnell): have subclasses for specific moe methods?
@@ -179,7 +179,7 @@ class FusedMoEQuantConfig:
     #
 
     @property
-    def quant_dtype(self) -> Union[torch.dtype, str, None]:
+    def quant_dtype(self) -> torch.dtype | str | None:
         return self._a1.dtype
 
     @property
@@ -203,7 +203,7 @@ class FusedMoEQuantConfig:
         return self._a1.shape == GroupShape.PER_TENSOR
 
     @property
-    def block_shape(self) -> Optional[list[int]]:
+    def block_shape(self) -> list[int] | None:
         if (
             self._a1.shape is not None
             and self._a1.shape != GroupShape.PER_TENSOR
@@ -218,34 +218,34 @@ class FusedMoEQuantConfig:
         return self.block_shape is not None
 
     @property
-    def a1_scale(self) -> Optional[torch.Tensor]:
+    def a1_scale(self) -> torch.Tensor | None:
         assert self._a1.scale is None or isinstance(self._a1.scale, torch.Tensor)
         return self._a1.scale
 
     @property
-    def a1_gscale(self) -> Optional[torch.Tensor]:
+    def a1_gscale(self) -> torch.Tensor | None:
         return self._a1.alpha_or_gscale
 
     @property
-    def a2_scale(self) -> Optional[torch.Tensor]:
+    def a2_scale(self) -> torch.Tensor | None:
         assert self._a2.scale is None or isinstance(self._a2.scale, torch.Tensor)
         return self._a2.scale
 
     @property
-    def a2_gscale(self) -> Optional[torch.Tensor]:
+    def a2_gscale(self) -> torch.Tensor | None:
         return self._a2.alpha_or_gscale
 
     @property
-    def w1_scale(self) -> Optional[torch.Tensor]:
+    def w1_scale(self) -> torch.Tensor | None:
         assert self._w1.scale is None or isinstance(self._w1.scale, torch.Tensor)
         return self._w1.scale
 
     @property
-    def w1_zp(self) -> Optional[torch.Tensor]:
+    def w1_zp(self) -> torch.Tensor | None:
         return self._w1.zp
 
     @property
-    def w1_bias(self) -> Optional[torch.Tensor]:
+    def w1_bias(self) -> torch.Tensor | None:
         return self._w1.bias
 
     @property
@@ -254,20 +254,20 @@ class FusedMoEQuantConfig:
         return self._w1.scale
 
     @property
-    def g1_alphas(self) -> Optional[torch.Tensor]:
+    def g1_alphas(self) -> torch.Tensor | None:
         return self._w1.alpha_or_gscale
 
     @property
-    def w2_scale(self) -> Optional[torch.Tensor]:
+    def w2_scale(self) -> torch.Tensor | None:
         assert self._w2.scale is None or isinstance(self._w2.scale, torch.Tensor)
         return self._w2.scale
 
     @property
-    def w2_zp(self) -> Optional[torch.Tensor]:
+    def w2_zp(self) -> torch.Tensor | None:
         return self._w2.zp
 
     @property
-    def w2_bias(self) -> Optional[torch.Tensor]:
+    def w2_bias(self) -> torch.Tensor | None:
         return self._w2.bias
 
     @property
@@ -276,7 +276,7 @@ class FusedMoEQuantConfig:
         return self._w2.scale
 
     @property
-    def g2_alphas(self) -> Optional[torch.Tensor]:
+    def g2_alphas(self) -> torch.Tensor | None:
         return self._w2.alpha_or_gscale
 
     @property
@@ -296,7 +296,7 @@ class FusedMoEQuantConfig:
         return self._a1.dtype is None and self._w1.dtype == "int4"
 
     @property
-    def ocp_mx_scheme(self) -> Union[str, None]:
+    def ocp_mx_scheme(self) -> str | None:
         if not hasattr(self, "_ocp_mx_scheme"):
             if (self._a1.dtype is not None and not isinstance(self._a1.dtype, str)) or (
                 self._w1.dtype is not None and not isinstance(self._w1.dtype, str)
@@ -322,7 +322,7 @@ class FusedMoEQuantConfig:
     def use_nvfp4_w4a4(self) -> bool:
         return self.quant_dtype == "nvfp4"
 
-    def config_name(self, dtype: torch.dtype) -> Optional[str]:
+    def config_name(self, dtype: torch.dtype) -> str | None:
         """
         Return a string used to construct the filename that contains the
         tuning info for a particular quantization scheme.  See
@@ -340,7 +340,7 @@ class FusedMoEQuantConfig:
         self,
         max_tokens: int,
         hidden_dim: int,
-    ) -> Optional[tuple[int, int]]:
+    ) -> tuple[int, int] | None:
         """
         Construct the proper activation scale shape for this
         config.
@@ -363,7 +363,7 @@ class FusedMoEQuantConfig:
         num_experts: int,
         max_tokens: int,
         hidden_dim: int,
-    ) -> Optional[tuple[int, int, int]]:
+    ) -> tuple[int, int, int] | None:
         """
         Construct the proper activation batched scale shape for this
         config, e.g. (num experts, *scale_shape).
@@ -377,23 +377,23 @@ class FusedMoEQuantConfig:
 
     @staticmethod
     def make(
-        quant_dtype: Union[torch.dtype, str, None] = None,
+        quant_dtype: torch.dtype | str | None = None,
         per_act_token_quant: bool = False,
         per_out_ch_quant: bool = False,
-        block_shape: Optional[list[int]] = None,
+        block_shape: list[int] | None = None,
         w1_scale: Union[torch.Tensor, "PrecisionConfig", None] = None,
         w2_scale: Union[torch.Tensor, "PrecisionConfig", None] = None,
-        a1_scale: Optional[torch.Tensor] = None,
-        a2_scale: Optional[torch.Tensor] = None,
-        g1_alphas: Optional[torch.Tensor] = None,
-        g2_alphas: Optional[torch.Tensor] = None,
-        a1_gscale: Optional[torch.Tensor] = None,
-        a2_gscale: Optional[torch.Tensor] = None,
-        w1_bias: Optional[torch.Tensor] = None,
-        w2_bias: Optional[torch.Tensor] = None,
-        w1_zp: Optional[torch.Tensor] = None,
-        w2_zp: Optional[torch.Tensor] = None,
-        weight_dtype: Union[torch.dtype, str, None] = None,
+        a1_scale: torch.Tensor | None = None,
+        a2_scale: torch.Tensor | None = None,
+        g1_alphas: torch.Tensor | None = None,
+        g2_alphas: torch.Tensor | None = None,
+        a1_gscale: torch.Tensor | None = None,
+        a2_gscale: torch.Tensor | None = None,
+        w1_bias: torch.Tensor | None = None,
+        w2_bias: torch.Tensor | None = None,
+        w1_zp: torch.Tensor | None = None,
+        w2_zp: torch.Tensor | None = None,
+        weight_dtype: torch.dtype | str | None = None,
     ) -> "FusedMoEQuantConfig":
         """
         General builder function for a FusedMoEQuantConfig.
@@ -457,11 +457,11 @@ class FusedMoEQuantConfig:
 def fp8_w8a8_moe_quant_config(
     w1_scale: torch.Tensor,
     w2_scale: torch.Tensor,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
     per_act_token_quant: bool = False,
     per_out_ch_quant: bool = False,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
 ) -> FusedMoEQuantConfig:
     """
     Construct a quant config for fp8 activations and fp8 weights.
@@ -481,8 +481,8 @@ def fp8_w8a8_moe_quant_config(
 def int8_w8a8_moe_quant_config(
     w1_scale: torch.Tensor,
     w2_scale: torch.Tensor,
-    a1_scale: Optional[torch.Tensor],
-    a2_scale: Optional[torch.Tensor],
+    a1_scale: torch.Tensor | None,
+    a2_scale: torch.Tensor | None,
     per_act_token_quant: bool = False,
 ) -> FusedMoEQuantConfig:
     """
@@ -503,8 +503,8 @@ def int8_w8a8_moe_quant_config(
 def mxfp4_w4a16_moe_quant_config(
     w1_scale: Union[torch.Tensor, "PrecisionConfig"],
     w2_scale: Union[torch.Tensor, "PrecisionConfig"],
-    w1_bias: Optional[torch.Tensor] = None,
-    w2_bias: Optional[torch.Tensor] = None,
+    w1_bias: torch.Tensor | None = None,
+    w2_bias: torch.Tensor | None = None,
 ) -> FusedMoEQuantConfig:
     """
     Construct a quant config for unquantized activations and mxfp4 weights.
@@ -521,12 +521,12 @@ def ocp_mx_moe_quant_config(
     quant_dtype: str,
     w1_scale: Union[torch.Tensor, "PrecisionConfig"],
     w2_scale: Union[torch.Tensor, "PrecisionConfig"],
-    weight_dtype: Optional[str] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    w1_bias: Optional[torch.Tensor] = None,
-    w2_bias: Optional[torch.Tensor] = None,
-    block_shape: Optional[list[int]] = None,
+    weight_dtype: str | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    w1_bias: torch.Tensor | None = None,
+    w2_bias: torch.Tensor | None = None,
+    block_shape: list[int] | None = None,
 ) -> FusedMoEQuantConfig:
     """
     Construct a quant config for mxfp4 activations and mxfp4 weights.
@@ -575,9 +575,9 @@ def nvfp4_moe_quant_config(
 def int4_w4a16_moe_quant_config(
     w1_scale: torch.Tensor,
     w2_scale: torch.Tensor,
-    w1_zp: Optional[torch.Tensor],
-    w2_zp: Optional[torch.Tensor],
-    block_shape: Optional[list[int]] = None,
+    w1_zp: torch.Tensor | None,
+    w2_zp: torch.Tensor | None,
+    block_shape: list[int] | None = None,
 ) -> FusedMoEQuantConfig:
     """
     Construct a quant config for 16-bit float activations and int4 weights.
@@ -595,9 +595,9 @@ def int4_w4a16_moe_quant_config(
 def int8_w8a16_moe_quant_config(
     w1_scale: torch.Tensor,
     w2_scale: torch.Tensor,
-    w1_zp: Optional[torch.Tensor],
-    w2_zp: Optional[torch.Tensor],
-    block_shape: Optional[list[int]] = None,
+    w1_zp: torch.Tensor | None,
+    w2_zp: torch.Tensor | None,
+    block_shape: list[int] | None = None,
 ) -> FusedMoEQuantConfig:
     """
     Construct a quant config for 16-bit float activations and int8 weights.
@@ -613,8 +613,8 @@ def int8_w8a16_moe_quant_config(
 
 
 def biased_moe_quant_config(
-    w1_bias: Optional[torch.Tensor],
-    w2_bias: Optional[torch.Tensor],
+    w1_bias: torch.Tensor | None,
+    w2_bias: torch.Tensor | None,
 ) -> FusedMoEQuantConfig:
     """
     Construct a quant config for unquantized activations with biases.
diff --git a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py
index 3592a88b0ef2f..552d9e9cf88f3 100644
--- a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 from torch.nn import functional as F
@@ -33,7 +33,7 @@ def grouped_topk(
     topk_group: int = 0,
     scoring_func: str = "softmax",
     routed_scaling_factor: float = 1.0,
-    e_score_correction_bias: Optional[torch.Tensor] = None,
+    e_score_correction_bias: torch.Tensor | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
 
@@ -88,12 +88,12 @@ def select_experts(
     top_k: int,
     use_grouped_topk: bool,
     renormalize: bool,
-    topk_group: Optional[int] = None,
-    num_expert_group: Optional[int] = None,
-    custom_routing_function: Optional[Callable] = None,
+    topk_group: int | None = None,
+    num_expert_group: int | None = None,
+    custom_routing_function: Callable | None = None,
     scoring_func: str = "softmax",
     routed_scaling_factor: float = 1.0,
-    e_score_correction_bias: Optional[torch.Tensor] = None,
+    e_score_correction_bias: torch.Tensor | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     if use_grouped_topk:
         assert topk_group is not None
@@ -147,14 +147,14 @@ class IPEXFusedMOE:
         top_k: int,
         router_logits: torch.Tensor,
         renormalize: bool,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
     ) -> torch.Tensor:
@@ -189,14 +189,14 @@ class SGLFusedMOE:
         top_k: int,
         router_logits: torch.Tensor,
         renormalize: bool,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
     ) -> torch.Tensor:
@@ -247,14 +247,14 @@ class CPUFusedMOE:
         top_k: int,
         router_logits: torch.Tensor,
         renormalize: bool,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
     ) -> torch.Tensor:
diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
index fa158287d418d..e08ed8fa886f7 100644
--- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """CUTLASS based Fused MoE kernels."""
 
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 
@@ -35,23 +35,23 @@ def run_cutlass_moe_fp8(
     topk_ids: torch.Tensor,
     activation_callable: Callable,
     global_num_experts: int,
-    expert_map: Optional[torch.Tensor],
-    w1_scale: Optional[torch.Tensor],
-    w2_scale: Optional[torch.Tensor],
-    a1q_scale: Optional[torch.Tensor],
-    a2_scale: Optional[torch.Tensor],
+    expert_map: torch.Tensor | None,
+    w1_scale: torch.Tensor | None,
+    w2_scale: torch.Tensor | None,
+    a1q_scale: torch.Tensor | None,
+    a2_scale: torch.Tensor | None,
     ab_strides1: torch.Tensor,
     ab_strides2: torch.Tensor,
     c_strides1: torch.Tensor,
     c_strides2: torch.Tensor,
     workspace13: torch.Tensor,
     workspace2: torch.Tensor,
-    expert_num_tokens: Optional[torch.Tensor],
+    expert_num_tokens: torch.Tensor | None,
     out_dtype: torch.dtype,
     per_act_token: bool,
     per_out_ch: bool,
     use_batched_format: bool,
-    topk_weights: Optional[torch.Tensor],
+    topk_weights: torch.Tensor | None,
 ):
     a1q = hidden_states
 
@@ -249,7 +249,7 @@ def run_cutlass_moe_fp8(
 class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute):
     def __init__(
         self,
-        out_dtype: Optional[torch.dtype],
+        out_dtype: torch.dtype | None,
         ab_strides1: torch.Tensor,
         ab_strides2: torch.Tensor,
         c_strides1: torch.Tensor,
@@ -278,12 +278,12 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
         assert self.w1_zp is None, "w1_zp is not supported in CUTLASS MoE"
@@ -331,7 +331,7 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute):
 class CutlassExpertsFp8(CutlassExpertsFp8Base):
     def __init__(
         self,
-        out_dtype: Optional[torch.dtype],
+        out_dtype: torch.dtype | None,
         ab_strides1: torch.Tensor,
         ab_strides2: torch.Tensor,
         c_strides1: torch.Tensor,
@@ -377,7 +377,7 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base):
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         workspace1 = (M * topk, max(N, K))
         workspace2 = (M * topk, max(N // 2, K))
@@ -390,7 +390,7 @@ class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base):
         self,
         max_experts_per_worker: int,
         num_dispatchers: int,
-        out_dtype: Optional[torch.dtype],
+        out_dtype: torch.dtype | None,
         ab_strides1: torch.Tensor,
         ab_strides2: torch.Tensor,
         c_strides1: torch.Tensor,
@@ -435,7 +435,7 @@ class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base):
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         num_dp = self.num_dispatchers
         assert num_dp is not None
@@ -457,7 +457,7 @@ def cutlass_moe_fp8(
     c_strides2: torch.Tensor,
     quant_config: FusedMoEQuantConfig,
     activation: str = "silu",
-    expert_map: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
     apply_router_weight_on_input: bool = False,
     global_num_experts: int = -1,
 ) -> torch.Tensor:
@@ -768,7 +768,7 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute):
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         workspace1: tuple[int, ...] = ()
         workspace2: tuple[int, ...] = ()
@@ -793,12 +793,12 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],  # unused
-        a2_scale: Optional[torch.Tensor],  # unused
-        workspace13: Optional[torch.Tensor],
-        workspace2: Optional[torch.Tensor],
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,  # unused
+        a2_scale: torch.Tensor | None,  # unused
+        workspace13: torch.Tensor | None,
+        workspace2: torch.Tensor | None,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
         e, m, n, k, _ = self.moe_problem_size(hidden_states, w1, w2, topk_ids)
@@ -839,7 +839,7 @@ def cutlass_moe_fp4(
     n: int,
     k: int,
     e: int,
-    expert_map: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
     apply_router_weight_on_input: bool = False,
 ) -> torch.Tensor:
     assert expert_map is None, (
@@ -896,7 +896,7 @@ def _valid_cutlass_block_scaled_grouped_gemm(
     inplace: bool,
     activation: str,
     apply_router_weight_on_input: bool,
-    expert_map: Optional[torch.Tensor],
+    expert_map: torch.Tensor | None,
 ) -> bool:
     def _valid_cutlass_block_scaled_grouped_gemm_shape(N: int, K: int):
         return N % 128 == 0 and K % 128 == 0
diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
index fc0cb5c530da6..350c21e0a95bc 100644
--- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 from tqdm import tqdm
@@ -204,7 +203,7 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         assert self.block_shape is not None
         block_m = self.block_shape[0]
@@ -228,12 +227,12 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
         assert a1q_scale is not None
@@ -284,7 +283,7 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
         self.activation(activation, act_out, mm1_out.view(-1, N))
 
-        a2q_scale: Optional[torch.Tensor] = None
+        a2q_scale: torch.Tensor | None = None
         a2q, a2q_scale = per_token_group_quant_fp8(
             act_out, self.block_shape[1], column_major_scales=True, out_q=quant_out
         )
@@ -317,9 +316,9 @@ def deep_gemm_moe_fp8(
     inplace: bool = False,
     activation: str = "silu",
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
     apply_router_weight_on_input=False,
 ) -> torch.Tensor:
     """
diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py b/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
index 2ac968a9b4ab4..570c5ec09d2d3 100644
--- a/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
+++ b/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py
@@ -6,7 +6,6 @@ and updated to fit vllm needs and terminology.
 """
 
 import functools
-from typing import Optional
 
 import torch
 
@@ -39,7 +38,7 @@ def compute_aligned_M(
     num_topk: int,
     local_num_experts: int,
     alignment: int,
-    expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+    expert_tokens_meta: mk.ExpertTokensMetadata | None,
 ):
     if (expert_tokens_meta is not None) and (
         expert_tokens_meta.expert_num_tokens_cpu is not None
@@ -175,7 +174,7 @@ def ep_scatter(
     recv_x_scale: torch.Tensor,
     recv_topk: torch.Tensor,
     num_recv_tokens_per_expert: torch.Tensor,
-    expert_map: Optional[torch.Tensor],
+    expert_map: torch.Tensor | None,
     expert_start_loc: torch.Tensor,
     output_tensor: torch.Tensor,
     output_tensor_scale: torch.Tensor,
@@ -305,7 +304,7 @@ def ep_gather(
     recv_topk_ids: torch.Tensor,
     recv_topk_weight: torch.Tensor,
     input_index: torch.Tensor,
-    expert_map: Optional[torch.Tensor],
+    expert_map: torch.Tensor | None,
     output_tensor: torch.Tensor,
 ):
     num_warps = 2
@@ -346,9 +345,9 @@ def deepgemm_moe_permute(
     aq_scale: torch.Tensor,
     topk_ids: torch.Tensor,
     local_num_experts: int,
-    expert_map: Optional[torch.Tensor],
-    expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
-    aq_out: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None,
+    expert_tokens_meta: mk.ExpertTokensMetadata | None,
+    aq_out: torch.Tensor | None = None,
 ):
     assert aq.ndim == 2
     assert topk_ids.dtype.is_signed, "The kernel uses -1 to represent invalid topk_ids"
@@ -415,7 +414,7 @@ def deepgemm_unpermute_and_reduce(
     topk_ids: torch.Tensor,
     topk_weights: torch.Tensor,
     inv_perm: torch.Tensor,
-    expert_map: Optional[torch.Tensor],
+    expert_map: torch.Tensor | None,
     output: torch.Tensor,
 ):
     return ep_gather(
diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
index 85c4fd90dc6c1..40cc6d2cee988 100644
--- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Callable, Optional, Union
+from collections.abc import Callable
 
 import deep_ep
 import torch
@@ -77,18 +77,18 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
     def activation_format(self) -> mk.FusedMoEActivationFormat:
         return mk.FusedMoEActivationFormat.Standard
 
-    def max_num_tokens_per_rank(self) -> Optional[int]:
+    def max_num_tokens_per_rank(self) -> int | None:
         return None
 
-    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+    def topk_indices_dtype(self) -> torch.dtype | None:
         return torch.int64
 
-    def _get_dispatch_config(self) -> Optional[deep_ep.Config]:
+    def _get_dispatch_config(self) -> deep_ep.Config | None:
         if self.num_dispatchers_ not in self.available_rank_configs:
             return None
         return deep_ep.Buffer.get_dispatch_config(self.num_dispatchers_)
 
-    def _get_combine_config(self) -> Optional[deep_ep.Config]:
+    def _get_combine_config(self) -> deep_ep.Config | None:
         if self.num_dispatchers_ not in self.available_rank_configs:
             return None
         return deep_ep.Buffer.get_combine_config(self.num_dispatchers_)
@@ -96,11 +96,11 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
     def _do_dispatch(
         self,
         tokens: torch.Tensor,
-        token_scales: Optional[torch.Tensor],
+        token_scales: torch.Tensor | None,
         rank_topk_ids: torch.Tensor,
         rank_topk_weights: torch.Tensor,
         num_experts: int,
-        a1_scale: Optional[torch.Tensor],
+        a1_scale: torch.Tensor | None,
         quant_config: FusedMoEQuantConfig,
     ) -> Callable:
         has_scales = token_scales is not None
@@ -175,12 +175,12 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         self,
         event: deep_ep.EventOverlap,
         has_scales: bool,
-        token_data: Union[tuple[torch.Tensor, torch.Tensor], torch.Tensor],
-        expert_topk_ids: Optional[torch.Tensor],
+        token_data: tuple[torch.Tensor, torch.Tensor] | torch.Tensor,
+        expert_topk_ids: torch.Tensor | None,
         num_experts: int,
         expert_num_tokens_per_expert_list: list[int],
-        expert_topk_weights: Optional[torch.Tensor],
-        a1_scale: Optional[torch.Tensor],
+        expert_topk_weights: torch.Tensor | None,
+        a1_scale: torch.Tensor | None,
         quant_config: FusedMoEQuantConfig,
     ) -> mk.PrepareResultType:
         if event.event is not None:
@@ -249,7 +249,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
     ) -> mk.ReceiverType:
@@ -294,7 +294,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
     ) -> mk.PrepareResultType:
@@ -318,7 +318,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         apply_router_weight_on_input: bool,
         weight_and_reduce_impl: mk.TopKWeightAndReduce,
         do_async: bool,
-    ) -> Optional[Callable]:
+    ) -> Callable | None:
         a2a_idx = dbo_current_ubatch_id()
         handle = self.handles[a2a_idx]
         assert handle is not None
diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
index 117bfe6e6b4d7..b3ba2e308953a 100644
--- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Callable, Optional, Union
+from collections.abc import Callable
 
 import deep_ep
 import torch
@@ -67,7 +67,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         # The dispatch function returns a handle that the combine function
         # requires. We store the handle here so it is available to the
         # combine function.
-        self.handles: list[Optional[tuple]] = [None, None]
+        self.handles: list[tuple | None] = [None, None]
         self.num_dispatchers_ = num_dispatchers
 
     def num_dispatchers(self) -> int:
@@ -80,18 +80,18 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
     def activation_format(self) -> mk.FusedMoEActivationFormat:
         return mk.FusedMoEActivationFormat.BatchedExperts
 
-    def max_num_tokens_per_rank(self) -> Optional[int]:
+    def max_num_tokens_per_rank(self) -> int | None:
         return self.max_tokens_per_rank
 
-    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+    def topk_indices_dtype(self) -> torch.dtype | None:
         return torch.int64
 
     def _do_quant(
         self,
-        x: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        x: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
         a1_dtype: torch.dtype,
         quant_config: FusedMoEQuantConfig,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         if self.use_fp8_dispatch:
             block_k = (
                 quant_config.block_shape[1]
@@ -137,7 +137,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
     ) -> tuple[Callable, mk.ReceiverType]:
@@ -200,9 +200,9 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
 
     def _receiver(
         self,
-        expert_x: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        expert_x: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
         expert_num_tokens: torch.Tensor,
-        a1_scale: Optional[torch.Tensor],
+        a1_scale: torch.Tensor | None,
         a1_dtype: torch.dtype,
         quant_config: FusedMoEQuantConfig,
     ) -> mk.PrepareResultType:
@@ -220,7 +220,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
     ) -> mk.PrepareResultType:
diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
index 1b33c7075fb36..b7820319682be 100644
--- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
@@ -96,7 +95,7 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         # We use global_num_experts due to how moe_align_block_size handles
         # expert_maps.
@@ -133,13 +132,13 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
-        workspace13: Optional[torch.Tensor],
-        workspace2: Optional[torch.Tensor],
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
-        apply_router_weight_on_input: Optional[bool],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
+        workspace13: torch.Tensor | None,
+        workspace2: torch.Tensor | None,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        apply_router_weight_on_input: bool | None,
     ):
         assert activation == "silu", (
             "Only activation silu is supported in FlashInferExperts"
@@ -207,7 +206,7 @@ def flashinfer_cutlass_moe_fp4(
     inplace: bool = False,
     activation: str = "silu",
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
     apply_router_weight_on_input: bool = False,
 ) -> torch.Tensor:
     fused_experts = mk.FusedMoEModularKernel(
@@ -242,7 +241,7 @@ def flashinfer_cutlass_moe(
     inplace: bool = False,
     activation: str = "silu",
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
     apply_router_weight_on_input: bool = False,
     tp_rank: int = 0,
     tp_size: int = 1,
diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
index 4907b9ff5730b..20e2f6c851861 100644
--- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
@@ -39,10 +38,10 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
     def activation_format(self) -> mk.FusedMoEActivationFormat:
         return mk.FusedMoEActivationFormat.Standard
 
-    def max_num_tokens_per_rank(self) -> Optional[int]:
+    def max_num_tokens_per_rank(self) -> int | None:
         return None
 
-    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+    def topk_indices_dtype(self) -> torch.dtype | None:
         return None
 
     def num_dispatchers(self) -> int:
@@ -89,7 +88,7 @@ class FlashInferAllToAllMoEPrepareAndFinalize(FlashInferCutlassMoEPrepareAndFina
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
     ) -> mk.PrepareResultType:
@@ -164,7 +163,7 @@ class FlashInferAllGatherMoEPrepareAndFinalize(FlashInferCutlassMoEPrepareAndFin
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
     ) -> mk.PrepareResultType:
diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
index d12d05915566d..698d12d5eaddb 100644
--- a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
@@ -105,7 +104,7 @@ direct_register_custom_op(
 
 def flashinfer_fused_moe_per_tensor_scale_fp8(
     routing_logits: torch.Tensor,
-    routing_bias: Optional[torch.Tensor],
+    routing_bias: torch.Tensor | None,
     hidden_states: torch.Tensor,
     input_scale: torch.Tensor,
     gemm1_weights: torch.Tensor,
@@ -115,8 +114,8 @@ def flashinfer_fused_moe_per_tensor_scale_fp8(
     output2_scales_scalar: torch.Tensor,
     num_experts: int,
     top_k: int,
-    num_expert_group: Optional[int],
-    topk_group: Optional[int],
+    num_expert_group: int | None,
+    topk_group: int | None,
     intermediate_size: int,
     local_expert_offset: int,
     local_num_experts: int,
@@ -163,7 +162,7 @@ def flashinfer_fused_moe_per_tensor_scale_fp8(
 
 def flashinfer_fused_moe_per_tensor_scale_fp8_fake(
     routing_logits: torch.Tensor,
-    routing_bias: Optional[torch.Tensor],
+    routing_bias: torch.Tensor | None,
     hidden_states: torch.Tensor,
     input_scale: torch.Tensor,
     gemm1_weights: torch.Tensor,
@@ -173,8 +172,8 @@ def flashinfer_fused_moe_per_tensor_scale_fp8_fake(
     output2_scales_scalar: torch.Tensor,
     num_experts: int,
     top_k: int,
-    num_expert_group: Optional[int],
-    topk_group: Optional[int],
+    num_expert_group: int | None,
+    topk_group: int | None,
     intermediate_size: int,
     local_expert_offset: int,
     local_num_experts: int,
diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
index 0c31684d23677..7fd8511e297de 100644
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -2,8 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Fused batched MoE kernel."""
 
-from typing import Optional
-
 import torch
 
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
@@ -370,8 +368,8 @@ def invoke_moe_batched_triton_kernel(
     expert_num_tokens: torch.Tensor,  # [E]
     compute_type: tl.dtype,
     # Quantization data
-    A_scale: Optional[torch.Tensor],
-    B_scale: Optional[torch.Tensor],
+    A_scale: torch.Tensor | None,
+    B_scale: torch.Tensor | None,
     B_zp: torch.Tensor,
     # Quantization schemes
     use_fp8_w8a8: bool,
@@ -379,7 +377,7 @@ def invoke_moe_batched_triton_kernel(
     use_int4_w4a16: bool,
     config: dict[str, int],
     per_act_token_quant: bool,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
 ):
     assert not use_int4_w4a16
     max_num_tokens = A.size(1)
@@ -500,10 +498,10 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
     def activation_format(self) -> mk.FusedMoEActivationFormat:
         return mk.FusedMoEActivationFormat.BatchedExperts
 
-    def max_num_tokens_per_rank(self) -> Optional[int]:
+    def max_num_tokens_per_rank(self) -> int | None:
         return self.max_num_tokens
 
-    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+    def topk_indices_dtype(self) -> torch.dtype | None:
         return None
 
     def num_dispatchers(self) -> int:
@@ -518,7 +516,7 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
     ) -> mk.PrepareResultType:
@@ -674,7 +672,7 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         num_dp = self.num_dispatchers
         num_experts = local_num_experts
@@ -701,12 +699,12 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
         assert hidden_states.dim() == 3
@@ -754,15 +752,15 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
 def batched_moe_kernel_quantize_input(
     A: torch.Tensor,
-    A_scale: Optional[torch.Tensor],
+    A_scale: torch.Tensor | None,
     num_tokens: int,
     E: int,
     N: int,
     expert_num_tokens: torch.Tensor,
-    qtype: Optional[torch.dtype],
+    qtype: torch.dtype | None,
     per_act_token_quant: bool,
-    block_shape: Optional[list[int]] = None,
-) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    block_shape: list[int] | None = None,
+) -> tuple[torch.Tensor, torch.Tensor | None]:
     if torch.compiler.is_compiling() or torch.cuda.is_current_stream_capturing():
         # Note: this does a bunch of extra work because expert_num_tokens is
         # ignored but it does support torch.compile + cudagraphs.
@@ -868,7 +866,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         num_dp = self.num_dispatchers
         num_experts = local_num_experts
@@ -888,12 +886,12 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
         # Check constraints.
diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
index 6412c3eaa1932..58ed826ba037b 100644
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -2,8 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Fused MoE utilities for GPTQ."""
 
-from typing import Optional
-
 import torch
 from typing_extensions import override
 
@@ -28,31 +26,31 @@ def fused_marlin_moe(
     hidden_states: torch.Tensor,
     w1: torch.Tensor,
     w2: torch.Tensor,
-    bias1: Optional[torch.Tensor],
-    bias2: Optional[torch.Tensor],
+    bias1: torch.Tensor | None,
+    bias2: torch.Tensor | None,
     w1_scale: torch.Tensor,
     w2_scale: torch.Tensor,
-    gating_output: Optional[torch.Tensor],
+    gating_output: torch.Tensor | None,
     topk_weights: torch.Tensor,
     topk_ids: torch.Tensor,
     quant_type_id: int,
     apply_router_weight_on_input: bool = False,
     global_num_experts: int = -1,
-    activation: Optional[str] = "silu",
-    expert_map: Optional[torch.Tensor] = None,
-    global_scale1: Optional[torch.Tensor] = None,
-    global_scale2: Optional[torch.Tensor] = None,
-    g_idx1: Optional[torch.Tensor] = None,
-    g_idx2: Optional[torch.Tensor] = None,
-    sort_indices1: Optional[torch.Tensor] = None,
-    sort_indices2: Optional[torch.Tensor] = None,
-    w1_zeros: Optional[torch.Tensor] = None,
-    w2_zeros: Optional[torch.Tensor] = None,
-    workspace: Optional[torch.Tensor] = None,
-    intermediate_cache13: Optional[torch.Tensor] = None,
-    intermediate_cache2: Optional[torch.Tensor] = None,
+    activation: str | None = "silu",
+    expert_map: torch.Tensor | None = None,
+    global_scale1: torch.Tensor | None = None,
+    global_scale2: torch.Tensor | None = None,
+    g_idx1: torch.Tensor | None = None,
+    g_idx2: torch.Tensor | None = None,
+    sort_indices1: torch.Tensor | None = None,
+    sort_indices2: torch.Tensor | None = None,
+    w1_zeros: torch.Tensor | None = None,
+    w2_zeros: torch.Tensor | None = None,
+    workspace: torch.Tensor | None = None,
+    intermediate_cache13: torch.Tensor | None = None,
+    intermediate_cache2: torch.Tensor | None = None,
     is_k_full: bool = True,
-    output: Optional[torch.Tensor] = None,
+    output: torch.Tensor | None = None,
     inplace: bool = False,
 ) -> torch.Tensor:
     """
@@ -249,26 +247,26 @@ def fused_marlin_moe_fake(
     w2: torch.Tensor,
     w1_scale: torch.Tensor,
     w2_scale: torch.Tensor,
-    gating_output: Optional[torch.Tensor],
+    gating_output: torch.Tensor | None,
     topk_weights: torch.Tensor,
     topk_ids: torch.Tensor,
     quant_type_id: int,
     apply_router_weight_on_input: bool = False,
     global_num_experts: int = -1,
-    global_scale1: Optional[torch.Tensor] = None,
-    global_scale2: Optional[torch.Tensor] = None,
-    expert_map: Optional[torch.Tensor] = None,
-    g_idx1: Optional[torch.Tensor] = None,
-    g_idx2: Optional[torch.Tensor] = None,
-    sort_indices1: Optional[torch.Tensor] = None,
-    sort_indices2: Optional[torch.Tensor] = None,
-    w1_zeros: Optional[torch.Tensor] = None,
-    w2_zeros: Optional[torch.Tensor] = None,
-    workspace: Optional[torch.Tensor] = None,
-    intermediate_cache13: Optional[torch.Tensor] = None,
-    intermediate_cache2: Optional[torch.Tensor] = None,
+    global_scale1: torch.Tensor | None = None,
+    global_scale2: torch.Tensor | None = None,
+    expert_map: torch.Tensor | None = None,
+    g_idx1: torch.Tensor | None = None,
+    g_idx2: torch.Tensor | None = None,
+    sort_indices1: torch.Tensor | None = None,
+    sort_indices2: torch.Tensor | None = None,
+    w1_zeros: torch.Tensor | None = None,
+    w2_zeros: torch.Tensor | None = None,
+    workspace: torch.Tensor | None = None,
+    intermediate_cache13: torch.Tensor | None = None,
+    intermediate_cache2: torch.Tensor | None = None,
     is_k_full: bool = True,
-    output: Optional[torch.Tensor] = None,
+    output: torch.Tensor | None = None,
     inplace: bool = False,
 ) -> torch.Tensor:
     return torch.empty_like(hidden_states)
@@ -341,7 +339,7 @@ class MarlinExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         # Modular Kernel provisions output buffer from workspace1. However in
         # the fused_marlin_moe() function, the final torch.sum(), is defined
@@ -374,12 +372,12 @@ class MarlinExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
         assert self.w1_scale is not None
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index eda825ffcae1e..9f66e47dcb96c 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -5,7 +5,8 @@
 import functools
 import json
 import os
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable
+from typing import Any
 
 import torch
 import torch.nn.functional as F
@@ -539,10 +540,10 @@ def invoke_fused_moe_kernel(
     A: torch.Tensor,
     B: torch.Tensor,
     C: torch.Tensor,
-    A_scale: Optional[torch.Tensor],
-    B_scale: Optional[torch.Tensor],
-    B_zp: Optional[torch.Tensor],
-    topk_weights: Optional[torch.Tensor],
+    A_scale: torch.Tensor | None,
+    B_scale: torch.Tensor | None,
+    B_zp: torch.Tensor | None,
+    topk_weights: torch.Tensor | None,
     sorted_token_ids: torch.Tensor,
     expert_ids: torch.Tensor,
     num_tokens_post_padded: torch.Tensor,
@@ -555,8 +556,8 @@ def invoke_fused_moe_kernel(
     use_int8_w8a16: bool,
     use_int4_w4a16: bool,
     per_channel_quant: bool,
-    block_shape: Optional[list[int]] = None,
-    B_bias: Optional[torch.Tensor] = None,
+    block_shape: list[int] | None = None,
+    B_bias: torch.Tensor | None = None,
 ) -> None:
     assert topk_weights is not None or not mul_routed_weight
     assert topk_weights is None or topk_weights.stride(1) == 1
@@ -808,7 +809,7 @@ def zero_experts_compute_triton(
 
 # Adapted from: https://github.com/sgl-project/sglang/pull/2628
 def get_config_file_name(
-    E: int, N: int, dtype: Optional[str], block_shape: Optional[list[int]] = None
+    E: int, N: int, dtype: str | None, block_shape: list[int] | None = None
 ) -> str:
     device_name = current_platform.get_device_name().replace(" ", "_")
     dtype_selector = "" if not dtype else f",dtype={dtype}"
@@ -823,10 +824,10 @@ def get_config_file_name(
 def get_moe_configs(
     E: int,
     N: int,
-    dtype: Optional[str],
-    block_n: Optional[int] = None,
-    block_k: Optional[int] = None,
-) -> Optional[dict[int, Any]]:
+    dtype: str | None,
+    block_n: int | None = None,
+    block_k: int | None = None,
+) -> dict[int, Any] | None:
     """
     Return optimized configurations for the fused MoE kernel.
 
@@ -965,8 +966,8 @@ def get_default_config(
     N: int,
     K: int,
     topk: int,
-    dtype: Optional[str],
-    block_shape: Optional[list[int]] = None,
+    dtype: str | None,
+    block_shape: list[int] | None = None,
 ) -> dict[str, int]:
     if dtype == "fp8_w8a8" and block_shape is not None:
         # Block-wise quant: BLOCK_SIZE_N must be divisible by block_shape[0]
@@ -1016,9 +1017,9 @@ def try_get_optimal_moe_config(
     w1_shape: tuple[int, ...],
     w2_shape: tuple[int, ...],
     top_k: int,
-    dtype: Optional[str],
+    dtype: str | None,
     M: int,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
 ) -> dict[str, int]:
     from vllm.model_executor.layers.fused_moe import get_config
 
@@ -1076,7 +1077,7 @@ def fused_topk(
     gating_output: torch.Tensor,
     topk: int,
     renormalize: bool,
-    indices_type: Optional[torch.dtype] = None,
+    indices_type: torch.dtype | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     assert hidden_states.size(0) == gating_output.size(0), "Number of tokens mismatch"
 
@@ -1135,7 +1136,7 @@ def grouped_topk(
     topk_group: int = 0,
     scoring_func: str = "softmax",
     routed_scaling_factor: float = 1.0,
-    e_score_correction_bias: Optional[torch.Tensor] = None,
+    e_score_correction_bias: torch.Tensor | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     if (
         envs.VLLM_USE_FUSED_MOE_GROUPED_TOPK
@@ -1211,7 +1212,7 @@ def eplb_map_to_physical_and_record(
     expert_load_view: torch.Tensor,
     logical_to_physical_map: torch.Tensor,
     logical_replica_count: torch.Tensor,
-    indices_type: Optional[torch.dtype] = None,
+    indices_type: torch.dtype | None = None,
 ) -> torch.Tensor:
     """
     Map the logical expert ids to physical expert ids
@@ -1326,19 +1327,19 @@ def inplace_fused_experts(
     use_int8_w8a8: bool = False,
     use_int8_w8a16: bool = False,
     use_int4_w4a16: bool = False,
-    ocp_mx_scheme: Optional[str] = None,
+    ocp_mx_scheme: str | None = None,
     per_channel_quant: bool = False,
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    w1_zp: Optional[torch.Tensor] = None,
-    w2_zp: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    block_shape: Optional[list[int]] = None,
-    w1_bias: Optional[torch.Tensor] = None,
-    w2_bias: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    w1_zp: torch.Tensor | None = None,
+    w2_zp: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    block_shape: list[int] | None = None,
+    w1_bias: torch.Tensor | None = None,
+    w2_bias: torch.Tensor | None = None,
 ) -> None:
     fused_experts_impl(
         hidden_states,
@@ -1381,19 +1382,19 @@ def inplace_fused_experts_fake(
     use_int8_w8a8: bool = False,
     use_int8_w8a16: bool = False,
     use_int4_w4a16: bool = False,
-    ocp_mx_scheme: Optional[str] = None,
+    ocp_mx_scheme: str | None = None,
     per_channel_quant: bool = False,
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    w1_zp: Optional[torch.Tensor] = None,
-    w2_zp: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    block_shape: Optional[list[int]] = None,
-    w1_bias: Optional[torch.Tensor] = None,
-    w2_bias: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    w1_zp: torch.Tensor | None = None,
+    w2_zp: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    block_shape: list[int] | None = None,
+    w1_bias: torch.Tensor | None = None,
+    w2_bias: torch.Tensor | None = None,
 ) -> None:
     pass
 
@@ -1423,19 +1424,19 @@ def outplace_fused_experts(
     use_int8_w8a8: bool = False,
     use_int8_w8a16: bool = False,
     use_int4_w4a16: bool = False,
-    ocp_mx_scheme: Optional[str] = None,
+    ocp_mx_scheme: str | None = None,
     per_channel_quant: bool = False,
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    w1_zp: Optional[torch.Tensor] = None,
-    w2_zp: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    block_shape: Optional[list[int]] = None,
-    w1_bias: Optional[torch.Tensor] = None,
-    w2_bias: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    w1_zp: torch.Tensor | None = None,
+    w2_zp: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    block_shape: list[int] | None = None,
+    w1_bias: torch.Tensor | None = None,
+    w2_bias: torch.Tensor | None = None,
 ) -> torch.Tensor:
     return fused_experts_impl(
         hidden_states,
@@ -1477,19 +1478,19 @@ def outplace_fused_experts_fake(
     use_int8_w8a8: bool = False,
     use_int8_w8a16: bool = False,
     use_int4_w4a16: bool = False,
-    ocp_mx_scheme: Optional[str] = None,
+    ocp_mx_scheme: str | None = None,
     per_channel_quant: bool = False,
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    w1_zp: Optional[torch.Tensor] = None,
-    w2_zp: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    block_shape: Optional[list[int]] = None,
-    w1_bias: Optional[torch.Tensor] = None,
-    w2_bias: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    w1_zp: torch.Tensor | None = None,
+    w2_zp: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    block_shape: list[int] | None = None,
+    w1_bias: torch.Tensor | None = None,
+    w2_bias: torch.Tensor | None = None,
 ) -> torch.Tensor:
     return torch.empty_like(hidden_states)
 
@@ -1534,8 +1535,8 @@ def fused_experts(
     activation: str = "silu",
     apply_router_weight_on_input: bool = False,
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
-    quant_config: Optional[FusedMoEQuantConfig] = None,
+    expert_map: torch.Tensor | None = None,
+    quant_config: FusedMoEQuantConfig | None = None,
     allow_deep_gemm: bool = False,
     allow_cutlass_block_scaled_grouped_gemm: bool = False,
 ) -> torch.Tensor:
@@ -1625,8 +1626,8 @@ GELU_NO_MUL: str = activation_without_mul("gelu")
 def _get_config_quant_dtype(
     use_fp8_w8a8: bool,
     use_int8_w8a8: bool,
-    ocp_mx_scheme: Optional[str],
-) -> Union[None, torch.dtype, str]:
+    ocp_mx_scheme: str | None,
+) -> None | torch.dtype | str:
     """
     Get the quantization type based on the quantization strategy flags.
     We don't have a quant_config at this point so we need to work backwards.
@@ -1660,19 +1661,19 @@ def fused_experts_impl(
     use_int8_w8a8: bool = False,
     use_int8_w8a16: bool = False,
     use_int4_w4a16: bool = False,
-    ocp_mx_scheme: Optional[str] = None,
+    ocp_mx_scheme: str | None = None,
     per_channel_quant: bool = False,
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    w1_zp: Optional[torch.Tensor] = None,
-    w2_zp: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    block_shape: Optional[list[int]] = None,
-    w1_bias: Optional[torch.Tensor] = None,
-    w2_bias: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    w1_zp: torch.Tensor | None = None,
+    w2_zp: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
+    block_shape: list[int] | None = None,
+    w1_bias: torch.Tensor | None = None,
+    w2_bias: torch.Tensor | None = None,
 ) -> torch.Tensor:
     # Check constraints.
     if use_int4_w4a16:
@@ -1964,7 +1965,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         workspace1 = (M, topk, max(N // 2, K))
         workspace2 = (M, topk, max(N, K))
@@ -1981,12 +1982,12 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
         # Check constraints.
@@ -2074,7 +2075,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
             activation, intermediate_cache2, intermediate_cache1.view(-1, N)
         )
 
-        a2q_scale: Optional[torch.Tensor] = None
+        a2q_scale: torch.Tensor | None = None
 
         qintermediate_cache2, a2q_scale = moe_kernel_quantize_input(
             intermediate_cache2,
diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
index 283ce80556d26..01fa9b99379b6 100644
--- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
+++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
@@ -80,10 +79,10 @@ def triton_kernel_moe_forward(
     topk: int,
     renormalize: bool,
     activation: str = "silu",
-    quant_config: Optional[FusedMoEQuantConfig] = None,
+    quant_config: FusedMoEQuantConfig | None = None,
     apply_router_weight_on_input: bool = False,
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
 ) -> torch.Tensor:
     routing_data, gather_idx, scatter_idx = routing(
         gating_output, topk, sm_first=not renormalize
@@ -115,13 +114,13 @@ def triton_kernel_fused_experts(
     gather_indx,  # GatherIndx
     scatter_indx,  # ScatterIndx
     activation: str = "silu",
-    quant_config: Optional[FusedMoEQuantConfig] = None,
+    quant_config: FusedMoEQuantConfig | None = None,
     swiglu_alpha: float = 1.702,
     swiglu_limit: float = 7.0,
     apply_router_weight_on_input: bool = False,
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
-    a1q_scale: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
+    a1q_scale: torch.Tensor | None = None,
 ) -> torch.Tensor:
     if quant_config is None:
         quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
@@ -261,7 +260,7 @@ class OAITritonExperts(BaseOAITritonExperts):
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         # workspace are allocated inside the kernel
         workspace1 = (M, K)
@@ -279,12 +278,12 @@ class OAITritonExperts(BaseOAITritonExperts):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
         if expert_map is not None:
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 94a733aa03b93..9b117f3b5d418 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -2,10 +2,10 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import abstractmethod
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from contextlib import nullcontext
 from enum import Enum
-from typing import Callable, Literal, Optional, Union, get_args, overload
+from typing import Literal, get_args, overload
 
 import torch
 import torch.nn.functional as F
@@ -70,15 +70,15 @@ if current_platform.is_cuda_alike():
         )
 else:
     fused_experts = None  # type: ignore
-    FusedMoEPermuteExpertsUnpermute = None  # type: ignore
-    FusedMoEPrepareAndFinalize = None  # type: ignore
+    FusedMoEPermuteExpertsUnpermute = object  # type: ignore
+    FusedMoEPrepareAndFinalize = object  # type: ignore
 
     def _eplb_map_to_physical_and_record(
         topk_ids: torch.Tensor,
         expert_load_view: torch.Tensor,
         logical_to_physical_map: torch.Tensor,
         logical_replica_count: torch.Tensor,
-        indices_type: Optional[torch.dtype],
+        indices_type: torch.dtype | None,
     ) -> torch.Tensor:
         # CPU fallback: no EPLB so just return as is
         return topk_ids
@@ -110,8 +110,8 @@ class FusedMoEMethodBase(QuantizeMethodBase):
     def __init__(self, moe: FusedMoEConfig):
         super().__init__()
         self.moe = moe
-        self.moe_quant_config: Optional[FusedMoEQuantConfig] = None
-        self.fused_experts: Optional[FusedMoEModularKernel] = None
+        self.moe_quant_config: FusedMoEQuantConfig | None = None
+        self.fused_experts: FusedMoEModularKernel | None = None
         self.topk_indices_dtype = None
 
     @abstractmethod
@@ -139,12 +139,12 @@ class FusedMoEMethodBase(QuantizeMethodBase):
     @staticmethod
     def _maybe_make_prepare_finalize(
         moe: FusedMoEConfig,
-        quant_config: Optional[FusedMoEQuantConfig],
-    ) -> Optional[FusedMoEPrepareAndFinalize]:
+        quant_config: FusedMoEQuantConfig | None,
+    ) -> FusedMoEPrepareAndFinalize | None:
         all2all_manager = get_ep_group().device_communicator.all2all_manager
         assert all2all_manager is not None
 
-        prepare_finalize: Optional[FusedMoEPrepareAndFinalize] = None
+        prepare_finalize: FusedMoEPrepareAndFinalize | None = None
 
         # TODO: could allow this now
         assert not moe.use_flashinfer_cutlass_kernels, "Must be created in modelopt.py"
@@ -229,7 +229,7 @@ class FusedMoEMethodBase(QuantizeMethodBase):
 
         return prepare_finalize
 
-    def maybe_make_prepare_finalize(self) -> Optional[FusedMoEPrepareAndFinalize]:
+    def maybe_make_prepare_finalize(self) -> FusedMoEPrepareAndFinalize | None:
         if self.moe.moe_parallel_config.use_all2all_kernels:
             return FusedMoEMethodBase._maybe_make_prepare_finalize(
                 self.moe, self.moe_quant_config
@@ -280,7 +280,7 @@ class FusedMoEMethodBase(QuantizeMethodBase):
     @abstractmethod
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
-    ) -> Optional[FusedMoEQuantConfig]:
+    ) -> FusedMoEQuantConfig | None:
         raise NotImplementedError
 
     @property
@@ -296,21 +296,21 @@ class FusedMoEMethodBase(QuantizeMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         raise NotImplementedError
 
 
@@ -368,7 +368,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                 )
             self.flashinfer_cutlass_moe = None  # type: ignore
 
-    def maybe_make_prepare_finalize(self) -> Optional[FusedMoEPrepareAndFinalize]:
+    def maybe_make_prepare_finalize(self) -> FusedMoEPrepareAndFinalize | None:
         if self.rocm_aiter_moe_enabled:
             return None
         else:
@@ -532,21 +532,21 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
             assert expert_load_view is not None
             assert logical_to_physical_map is not None
@@ -578,7 +578,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
-    ) -> Optional[FusedMoEQuantConfig]:
+    ) -> FusedMoEQuantConfig | None:
         if self.moe.has_bias:
             return biased_moe_quant_config(
                 layer.w13_bias,
@@ -595,21 +595,21 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         top_k: int,
         router_logits: torch.Tensor,
         renormalize: bool,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         zero_expert_num = getattr(layer, "zero_expert_num", 0)
         zero_expert_type = getattr(layer, "zero_expert_type", None)
 
@@ -705,21 +705,21 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         top_k: int,
         router_logits: torch.Tensor,
         renormalize: bool,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if (
             enable_eplb is not False
             or expert_load_view is not None
@@ -754,21 +754,21 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         top_k: int,
         router_logits: torch.Tensor,
         renormalize: bool,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if (
             enable_eplb is not False
             or expert_load_view is not None
@@ -795,21 +795,21 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         top_k: int,
         router_logits: torch.Tensor,
         renormalize: bool,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert not use_grouped_topk
         assert num_expert_group is None
         assert topk_group is None
@@ -860,7 +860,7 @@ def determine_expert_map(
     ep_rank: int,
     global_num_experts: int,
     expert_placement_strategy: ExpertPlacementStrategy = "linear",
-) -> tuple[int, Optional[torch.Tensor]]:
+) -> tuple[int, torch.Tensor | None]:
     """
     Calculates how many experts should be assigned to each rank for EP and
     creates a mapping from global to local expert index. Experts are
@@ -941,7 +941,7 @@ def get_compressed_expert_map(expert_map: torch.Tensor) -> str:
 def maybe_roundup_hidden_size(
     hidden_size: int,
     act_dtype: torch.dtype,
-    quant_config: Optional[QuantizationConfig],
+    quant_config: QuantizationConfig | None,
     moe_parallel_config: FusedMoEParallelConfig,
 ) -> int:
     """
@@ -1016,30 +1016,30 @@ class FusedMoE(CustomOp):
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
-        params_dtype: Optional[torch.dtype] = None,
+        params_dtype: torch.dtype | None = None,
         reduce_results: bool = False,
         renormalize: bool = True,
         use_grouped_topk: bool = False,
-        num_expert_group: Optional[int] = None,
-        topk_group: Optional[int] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        tp_size: Optional[int] = None,
-        ep_size: Optional[int] = None,
-        dp_size: Optional[int] = None,
+        num_expert_group: int | None = None,
+        topk_group: int | None = None,
+        quant_config: QuantizationConfig | None = None,
+        tp_size: int | None = None,
+        ep_size: int | None = None,
+        dp_size: int | None = None,
         prefix: str = "",
-        custom_routing_function: Optional[Callable] = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
         num_redundant_experts: int = 0,
         has_bias: bool = False,
         is_sequence_parallel=False,
-        zero_expert_num: Optional[int] = 0,
-        zero_expert_type: Optional[str] = None,
-        expert_mapping: Optional[list[tuple[str, str, int, str]]] = None,
+        zero_expert_num: int | None = 0,
+        zero_expert_type: str | None = None,
+        expert_mapping: list[tuple[str, str, int, str]] | None = None,
     ):
         super().__init__()
         if params_dtype is None:
@@ -1092,9 +1092,9 @@ class FusedMoE(CustomOp):
         self.layer_name = prefix
 
         self.enable_eplb = enable_eplb
-        self.expert_load_view: Optional[torch.Tensor] = None
-        self.logical_to_physical_map: Optional[torch.Tensor] = None
-        self.logical_replica_count: Optional[torch.Tensor] = None
+        self.expert_load_view: torch.Tensor | None = None
+        self.logical_to_physical_map: torch.Tensor | None = None
+        self.logical_replica_count: torch.Tensor | None = None
 
         # Determine expert maps
         if self.use_ep:
@@ -1128,7 +1128,7 @@ class FusedMoE(CustomOp):
                     )
                     expert_placement_strategy = "linear"
 
-            self.expert_map: Optional[torch.Tensor]
+            self.expert_map: torch.Tensor | None
             local_num_experts, expert_map = determine_expert_map(
                 ep_size=self.ep_size,
                 ep_rank=self.ep_rank,
@@ -1187,12 +1187,12 @@ class FusedMoE(CustomOp):
             has_bias=has_bias,
         )
         self.moe_config = moe
-        self.moe_quant_config: Optional[FusedMoEQuantConfig] = None
+        self.moe_quant_config: FusedMoEQuantConfig | None = None
         self.quant_config = quant_config
 
         # Note: get_quant_method will look at the layer's local_num_experts
         # for heuristic purposes, so it must be initialized first.
-        quant_method: Optional[QuantizeMethodBase] = None
+        quant_method: QuantizeMethodBase | None = None
         quant_method = (
             UnquantizedFusedMoEMethod(moe)
             if quant_config is None
@@ -1238,8 +1238,8 @@ class FusedMoE(CustomOp):
         self.quant_method.create_weights(layer=self, **moe_quant_params)
 
         # Chunked all2all staging tensor
-        self.batched_hidden_states: Optional[torch.Tensor] = None
-        self.batched_router_logits: Optional[torch.Tensor] = None
+        self.batched_hidden_states: torch.Tensor | None = None
+        self.batched_router_logits: torch.Tensor | None = None
 
         if self.use_dp_chunking:
             states_shape: tuple[int, ...]
@@ -1262,7 +1262,7 @@ class FusedMoE(CustomOp):
             )
 
     @property
-    def shared_experts(self) -> Optional[torch.nn.Module]:
+    def shared_experts(self) -> torch.nn.Module | None:
         return None
 
     @property
@@ -1534,7 +1534,7 @@ class FusedMoE(CustomOp):
         shard_id: str,
         expert_id: int,
         return_success: bool = False,
-    ) -> Optional[bool]:
+    ) -> bool | None:
         if self.quant_config and self.quant_config.get_name() == "mxfp4":
             # (FIXME) for gpt-oss all experts are combined
             if "bias" in weight_name:
@@ -1851,21 +1851,21 @@ class FusedMoE(CustomOp):
         top_k: int,
         use_grouped_topk: bool,
         renormalize: bool,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
-        custom_routing_function: Optional[Callable] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
-        indices_type: Optional[torch.dtype] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
+        indices_type: torch.dtype | None = None,
         enable_eplb: bool = False,
-        expert_map: Optional[torch.Tensor] = None,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-        global_num_experts: Optional[int] = None,
-        zero_expert_num: Optional[int] = None,
-        zero_expert_type: Optional[str] = None,
+        expert_map: torch.Tensor | None = None,
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+        global_num_experts: int | None = None,
+        zero_expert_num: int | None = None,
+        zero_expert_type: str | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Route the input hidden states to the top-k experts based on the
@@ -2006,7 +2006,7 @@ class FusedMoE(CustomOp):
         self,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         og_hidden_states = hidden_states.shape[-1]
         if self.hidden_size != og_hidden_states:
             hidden_states = F.pad(
@@ -2047,14 +2047,14 @@ class FusedMoE(CustomOp):
         self,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         return self.forward_native(hidden_states, router_logits)
 
     def forward_impl_chunked(
         self,
         full_hidden_states: torch.Tensor,
         full_router_logits: torch.Tensor,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert self.batched_hidden_states is not None
         assert self.batched_router_logits is not None
         assert self.batched_hidden_states.dtype == full_hidden_states.dtype
@@ -2200,7 +2200,7 @@ class FusedMoE(CustomOp):
         self,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert self.quant_method is not None
 
         self.ensure_moe_quant_config()
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index b5602a112ef13..a0ed88309df0c 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -1,10 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
+from collections.abc import Callable
 from dataclasses import dataclass
 from enum import Enum
 from math import prod
-from typing import Callable, Optional, Union, final
+from typing import final
 
 import torch
 
@@ -81,7 +82,7 @@ class ExpertTokensMetadata:
     """
 
     expert_num_tokens: torch.Tensor
-    expert_num_tokens_cpu: Optional[torch.Tensor]
+    expert_num_tokens_cpu: torch.Tensor | None
 
     @staticmethod
     def make_from_list(
@@ -104,7 +105,7 @@ class TopKWeightAndReduce(ABC):
     @abstractmethod
     def apply(
         self,
-        output: Optional[torch.Tensor],
+        output: torch.Tensor | None,
         fused_expert_output: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
@@ -132,10 +133,10 @@ class TopKWeightAndReduce(ABC):
 #
 PrepareResultType = tuple[
     torch.Tensor,
-    Optional[torch.Tensor],
-    Optional[ExpertTokensMetadata],
-    Optional[torch.Tensor],
-    Optional[torch.Tensor],
+    torch.Tensor | None,
+    ExpertTokensMetadata | None,
+    torch.Tensor | None,
+    torch.Tensor | None,
 ]
 
 ReceiverType = Callable[[], PrepareResultType]
@@ -155,7 +156,7 @@ class FusedMoEPrepareAndFinalize(ABC):
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
     ) -> PrepareResultType:
@@ -195,10 +196,10 @@ class FusedMoEPrepareAndFinalize(ABC):
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
-    ) -> Union[tuple[Callable, ReceiverType], ReceiverType]:
+    ) -> tuple[Callable, ReceiverType] | ReceiverType:
         """
         Perform any quantization (and/or) dispatching needed for this kernel
         but do not wait for results from other workers.
@@ -270,7 +271,7 @@ class FusedMoEPrepareAndFinalize(ABC):
         topk_ids: torch.Tensor,
         apply_router_weight_on_input: bool,
         weight_and_reduce_impl: TopKWeightAndReduce,
-    ) -> Union[tuple[Callable, Callable], Callable]:
+    ) -> tuple[Callable, Callable] | Callable:
         """
         Perform any combine plus apply weights and perform a reduction on the
         fused experts output but do not wait for results from other workers.
@@ -314,7 +315,7 @@ class FusedMoEPrepareAndFinalize(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+    def topk_indices_dtype(self) -> torch.dtype | None:
         """
         The PrepareFinalize All2All implementations generally constrain the
         dtype of the topk_ids they support. This function returns the
@@ -324,7 +325,7 @@ class FusedMoEPrepareAndFinalize(ABC):
         raise NotImplementedError
 
     @abstractmethod
-    def max_num_tokens_per_rank(self) -> Optional[int]:
+    def max_num_tokens_per_rank(self) -> int | None:
         """
         Some PrepareFinalize All2All implementations are batched. Meaning,
         they can process only as set of tokens at a time. This
@@ -423,11 +424,11 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
     #
 
     @property
-    def quant_dtype(self) -> Optional[torch.dtype]:
+    def quant_dtype(self) -> torch.dtype | None:
         return self.quant_config.quant_dtype
 
     @property
-    def block_shape(self) -> Optional[list[int]]:
+    def block_shape(self) -> list[int] | None:
         return self.quant_config.block_shape
 
     @property
@@ -439,51 +440,51 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
         return self.quant_config.per_out_ch_quant
 
     @property
-    def a1_scale(self) -> Optional[torch.Tensor]:
+    def a1_scale(self) -> torch.Tensor | None:
         return self.quant_config.a1_scale
 
     @property
-    def a2_scale(self) -> Optional[torch.Tensor]:
+    def a2_scale(self) -> torch.Tensor | None:
         return self.quant_config.a2_scale
 
     @property
-    def a1_gscale(self) -> Optional[torch.Tensor]:
+    def a1_gscale(self) -> torch.Tensor | None:
         return self.quant_config.a1_gscale
 
     @property
-    def a2_gscale(self) -> Optional[torch.Tensor]:
+    def a2_gscale(self) -> torch.Tensor | None:
         return self.quant_config.a2_gscale
 
     @property
-    def w1_scale(self) -> Optional[torch.Tensor]:
+    def w1_scale(self) -> torch.Tensor | None:
         return self.quant_config.w1_scale
 
     @property
-    def w2_scale(self) -> Optional[torch.Tensor]:
+    def w2_scale(self) -> torch.Tensor | None:
         return self.quant_config.w2_scale
 
     @property
-    def w1_zp(self) -> Optional[torch.Tensor]:
+    def w1_zp(self) -> torch.Tensor | None:
         return self.quant_config.w1_zp
 
     @property
-    def w2_zp(self) -> Optional[torch.Tensor]:
+    def w2_zp(self) -> torch.Tensor | None:
         return self.quant_config.w2_zp
 
     @property
-    def w1_bias(self) -> Optional[torch.Tensor]:
+    def w1_bias(self) -> torch.Tensor | None:
         return self.quant_config.w1_bias
 
     @property
-    def w2_bias(self) -> Optional[torch.Tensor]:
+    def w2_bias(self) -> torch.Tensor | None:
         return self.quant_config.w2_bias
 
     @property
-    def g1_alphas(self) -> Optional[torch.Tensor]:
+    def g1_alphas(self) -> torch.Tensor | None:
         return self.quant_config.g1_alphas
 
     @property
-    def g2_alphas(self) -> Optional[torch.Tensor]:
+    def g2_alphas(self) -> torch.Tensor | None:
         return self.quant_config.g2_alphas
 
     # TODO (bnell): make this return a CHUNK_SIZE or None instead?
@@ -517,7 +518,7 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[ExpertTokensMetadata],
+        expert_tokens_meta: ExpertTokensMetadata | None,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         """
         Compute the shapes for the temporary and final outputs of the two gemms
@@ -578,12 +579,12 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
-        expert_tokens_meta: Optional[ExpertTokensMetadata],
+        expert_tokens_meta: ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ) -> None:
         """
@@ -625,8 +626,8 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
 
 
 def _slice_scales(
-    scales: Optional[torch.Tensor], start: int, end: int
-) -> Optional[torch.Tensor]:
+    scales: torch.Tensor | None, start: int, end: int
+) -> torch.Tensor | None:
     if scales is not None:
         if scales.numel() == 1:
             return scales
@@ -688,7 +689,7 @@ class FusedMoEModularKernel(torch.nn.Module):
         self,
         prepare_finalize: FusedMoEPrepareAndFinalize,
         fused_experts: FusedMoEPermuteExpertsUnpermute,
-        shared_experts: Optional[torch.nn.Module] = None,
+        shared_experts: torch.nn.Module | None = None,
     ):
         super().__init__()
         self.prepare_finalize = prepare_finalize
@@ -741,7 +742,7 @@ class FusedMoEModularKernel(torch.nn.Module):
         top_k: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[ExpertTokensMetadata],
+        expert_tokens_meta: ExpertTokensMetadata | None,
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Allocate temporary and output buffers for the fused experts op.
@@ -825,11 +826,11 @@ class FusedMoEModularKernel(torch.nn.Module):
     @staticmethod
     def _slice_expert_tokens_metadata(
         num_chunks: int,
-        full_expert_tokens_meta: Optional[ExpertTokensMetadata],
+        full_expert_tokens_meta: ExpertTokensMetadata | None,
         chunk_topk_ids: torch.Tensor,
         local_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-    ) -> Optional[ExpertTokensMetadata]:
+        expert_map: torch.Tensor | None,
+    ) -> ExpertTokensMetadata | None:
         if num_chunks == 1 or full_expert_tokens_meta is None:
             return full_expert_tokens_meta
 
@@ -861,12 +862,12 @@ class FusedMoEModularKernel(torch.nn.Module):
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
     ) -> tuple[
         torch.Tensor,
-        Optional[torch.Tensor],
-        Optional[ExpertTokensMetadata],
+        torch.Tensor | None,
+        ExpertTokensMetadata | None,
         torch.Tensor,
         torch.Tensor,
     ]:
@@ -945,7 +946,7 @@ class FusedMoEModularKernel(torch.nn.Module):
         self,
         in_dtype: torch.dtype,
         a1q: torch.Tensor,
-        a1q_scale: Optional[torch.Tensor],
+        a1q_scale: torch.Tensor | None,
         w1: torch.Tensor,
         w2: torch.Tensor,
         topk_weights: torch.Tensor,
@@ -953,9 +954,9 @@ class FusedMoEModularKernel(torch.nn.Module):
         activation: str,
         global_num_experts: int,
         local_num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
-        expert_tokens_meta: Optional[ExpertTokensMetadata],
+        expert_tokens_meta: ExpertTokensMetadata | None,
     ) -> torch.Tensor:
         _, M_full, N, K, top_k = self.fused_experts.moe_problem_size(
             a1q, w1, w2, topk_ids
@@ -1042,12 +1043,12 @@ class FusedMoEModularKernel(torch.nn.Module):
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         apply_router_weight_on_input: bool,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         """
         The _finalize method is a wrapper around self.prepare_finalize.finalize
         that handles DBO, async and shared expert overlap.
         """
-        shared_output: Optional[torch.Tensor] = None
+        shared_output: torch.Tensor | None = None
 
         if not self.prepare_finalize.supports_async():
             assert not dbo_enabled()
@@ -1112,9 +1113,9 @@ class FusedMoEModularKernel(torch.nn.Module):
         inplace: bool = False,
         activation: str = "silu",
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
+        expert_map: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         """
         This function computes a Mixture of Experts (MoE) layer using two sets
         of weights, w1 and w2, and top-k gating mechanism.
diff --git a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
index 9994088ca5d9a..a0d14bdf607e7 100644
--- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
+++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
@@ -13,7 +12,7 @@ def moe_align_block_size(
     topk_ids: torch.Tensor,
     block_size: int,
     num_experts: int,
-    expert_map: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
     pad_sorted_ids: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """
diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
index 698080f8aec6f..9dcdcc3800363 100644
--- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
+++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
@@ -13,14 +12,12 @@ from vllm.model_executor.layers.fused_moe.utils import _fp8_perm
 
 def _moe_permute(
     curr_hidden_states: torch.Tensor,
-    a1q_scale: Optional[torch.Tensor],
+    a1q_scale: torch.Tensor | None,
     curr_topk_ids: torch.Tensor,
     global_num_experts: int,
-    expert_map: Optional[torch.Tensor],
+    expert_map: torch.Tensor | None,
     block_m: int,
-) -> tuple[
-    torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor, torch.Tensor
-]:
+) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     Determine the sorted_token_ids, expert_ids for the given problem size.
     Permute the hidden states and scales according to `sorted_token_ids`.
@@ -33,7 +30,7 @@ def _moe_permute(
         curr_topk_ids, block_m, global_num_experts, expert_map, pad_sorted_ids=True
     )
 
-    inv_perm: Optional[torch.Tensor] = None
+    inv_perm: torch.Tensor | None = None
 
     num_tokens = top_k_num * tokens_in_chunk
     expert_ids = torch.repeat_interleave(expert_ids, block_m, dim=0)
@@ -53,7 +50,7 @@ def _moe_permute(
 def _moe_unpermute_and_reduce(
     out: torch.Tensor,
     curr_hidden: torch.Tensor,
-    inv_perm: Optional[torch.Tensor],
+    inv_perm: torch.Tensor | None,
     topk_weight: torch.Tensor,
     apply_router_weight_on_input: bool,
 ) -> None:
@@ -73,17 +70,15 @@ def _moe_unpermute_and_reduce(
 
 def moe_permute(
     hidden_states: torch.Tensor,
-    a1q_scale: Optional[torch.Tensor],
+    a1q_scale: torch.Tensor | None,
     topk_ids: torch.Tensor,
     n_expert: int,
     n_local_expert: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
-    align_block_size: Optional[int] = None,
+    expert_map: torch.Tensor | None = None,
+    align_block_size: int | None = None,
     fill_invalid_expert: int = -1,
-    permuted_hidden_states: Optional[torch.Tensor] = None,
-) -> tuple[
-    torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor, torch.Tensor
-]:
+    permuted_hidden_states: torch.Tensor | None = None,
+) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     This function expands and permutes activation to gather uncontinuous tokens
       for each expert.
@@ -198,7 +193,7 @@ def moe_unpermute(
     permuted_hidden_states: torch.Tensor,
     topk_weights: torch.Tensor,
     inv_permuted_idx: torch.Tensor,
-    expert_first_token_offset: Optional[torch.Tensor] = None,
+    expert_first_token_offset: torch.Tensor | None = None,
 ) -> None:
     """
     This function expands and permutes activation to gathering uncontinuous
diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
index e87953e34eaf2..0e77fa54cd508 100644
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Callable, Optional, Union
+from collections.abc import Callable
 
 import pplx_kernels as pplx
 import torch
@@ -24,9 +24,9 @@ def pplx_hidden_dim_scale_bytes(
     max_num_tokens: int,
     hidden_dim: int,
     in_dtype: torch.dtype,
-    quant_dtype: Union[torch.dtype, str, None],
+    quant_dtype: torch.dtype | str | None,
     per_act_token_quant: bool,
-    block_shape: Optional[list[int]],
+    block_shape: list[int] | None,
 ):
     # All pplx byte sizes must be 16-byte aligned.
     align = 16
@@ -82,10 +82,10 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
     def activation_format(self) -> mk.FusedMoEActivationFormat:
         return mk.FusedMoEActivationFormat.BatchedExperts
 
-    def max_num_tokens_per_rank(self) -> Optional[int]:
+    def max_num_tokens_per_rank(self) -> int | None:
         return self.max_num_tokens
 
-    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+    def topk_indices_dtype(self) -> torch.dtype | None:
         return torch.uint32
 
     def num_dispatchers(self) -> int:
@@ -103,7 +103,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
     ) -> tuple[Callable, mk.ReceiverType]:
@@ -148,7 +148,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
             a1q, a1q_scale, quant_config.per_act_token_quant, quant_config.block_shape
         )
 
-        orig_a_scale_block_shape: Optional[int] = None
+        orig_a_scale_block_shape: int | None = None
 
         if a1q_scale is not None:
             scalar_scales = a1q_scale.numel() == 1
@@ -184,7 +184,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
             device=device,
         )
 
-        expert_x_scale: Optional[torch.Tensor] = None
+        expert_x_scale: torch.Tensor | None = None
         if a1q.dtype.itemsize == 1:
             if quant_config.is_per_act_token:
                 # (M x 1) -> (E x M x K)
@@ -212,7 +212,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
 
         # This argument is optional, defaults to indices.size(0)
         # There's not much point setting this unless it is != indices.size(0)
-        bound_m: Optional[torch.Tensor] = None
+        bound_m: torch.Tensor | None = None
 
         self.a2a.dispatch(
             out_expert_num_tokens=expert_num_tokens,
@@ -252,8 +252,8 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         self,
         expert_num_tokens: torch.Tensor,
         expert_x: torch.Tensor,
-        expert_x_scale: Optional[torch.Tensor],
-        orig_a_scale_block_shape: Optional[int],
+        expert_x_scale: torch.Tensor | None,
+        orig_a_scale_block_shape: int | None,
     ) -> mk.PrepareResultType:
         if expert_x_scale is not None:
             expert_x_scale = expert_x_scale[:, :, :orig_a_scale_block_shape]
@@ -271,7 +271,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
     ) -> mk.PrepareResultType:
@@ -302,7 +302,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
 
         # This argument is optional
         # There's not much point setting this unless it is != topk_ids.size(0)
-        bound_m: Optional[torch.Tensor] = None
+        bound_m: torch.Tensor | None = None
 
         # TODO (bnell): fails in test_pplx_moe.py, figure out what's going on
         # num_tokens = output.size(0)  # M
diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py
index 1e572d2394781..9bb976fb9ec93 100644
--- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
@@ -18,10 +17,10 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
     def activation_format(self) -> mk.FusedMoEActivationFormat:
         return mk.FusedMoEActivationFormat.Standard
 
-    def max_num_tokens_per_rank(self) -> Optional[int]:
+    def max_num_tokens_per_rank(self) -> int | None:
         return None
 
-    def topk_indices_dtype(self) -> Optional[torch.dtype]:
+    def topk_indices_dtype(self) -> torch.dtype | None:
         return None
 
     def num_dispatchers(self) -> int:
@@ -36,7 +35,7 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize):
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         num_experts: int,
-        expert_map: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
         apply_router_weight_on_input: bool,
         quant_config: FusedMoEQuantConfig,
     ) -> mk.PrepareResultType:
diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
index 801785b18fb9e..921e0b24b9efb 100644
--- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from enum import IntEnum
 from functools import cache
-from typing import Optional
 
 import torch
 
@@ -53,13 +52,13 @@ def rocm_aiter_asm_moe_tkw1_impl(
     w2: torch.Tensor,
     topk_weights: torch.Tensor,
     topk_ids: torch.Tensor,
-    fc1_scale: Optional[torch.Tensor] = None,
-    fc2_scale: Optional[torch.Tensor] = None,
-    fc1_smooth_scale: Optional[torch.Tensor] = None,
-    fc2_smooth_scale: Optional[torch.Tensor] = None,
+    fc1_scale: torch.Tensor | None = None,
+    fc2_scale: torch.Tensor | None = None,
+    fc1_smooth_scale: torch.Tensor | None = None,
+    fc2_smooth_scale: torch.Tensor | None = None,
     a16: bool = False,
-    per_tensor_quant_scale: Optional[torch.Tensor] = None,
-    expert_mask: Optional[torch.Tensor] = None,
+    per_tensor_quant_scale: torch.Tensor | None = None,
+    expert_mask: torch.Tensor | None = None,
     activation_method: int = ActivationMethod.SILU.value,
 ) -> torch.Tensor:
     from aiter import ActivationType
@@ -90,13 +89,13 @@ def rocm_aiter_asm_moe_tkw1_fake(
     w2: torch.Tensor,
     topk_weights: torch.Tensor,
     topk_ids: torch.Tensor,
-    fc1_scale: Optional[torch.Tensor] = None,
-    fc2_scale: Optional[torch.Tensor] = None,
-    fc1_smooth_scale: Optional[torch.Tensor] = None,
-    fc2_smooth_scale: Optional[torch.Tensor] = None,
+    fc1_scale: torch.Tensor | None = None,
+    fc2_scale: torch.Tensor | None = None,
+    fc1_smooth_scale: torch.Tensor | None = None,
+    fc2_smooth_scale: torch.Tensor | None = None,
     a16: bool = False,
-    per_tensor_quant_scale: Optional[torch.Tensor] = None,
-    expert_mask: Optional[torch.Tensor] = None,
+    per_tensor_quant_scale: torch.Tensor | None = None,
+    expert_mask: torch.Tensor | None = None,
     activation_method: int = ActivationMethod.SILU.value,
 ) -> torch.Tensor:
     return torch.empty_like(hidden_states)
@@ -206,14 +205,14 @@ def rocm_aiter_fused_moe_impl(
     w2: torch.Tensor,
     topk_weight: torch.Tensor,
     topk_ids: torch.Tensor,
-    expert_mask: Optional[torch.Tensor] = None,
+    expert_mask: torch.Tensor | None = None,
     activation_method: int = ActivationMethod.SILU.value,
     quant_method: int = QuantMethod.NO.value,
     doweight_stage1: bool = False,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
 ) -> torch.Tensor:
     from aiter import ActivationType, QuantType
     from aiter.fused_moe import fused_moe
@@ -244,14 +243,14 @@ def rocm_aiter_fused_moe_fake(
     w2: torch.Tensor,
     topk_weight: torch.Tensor,
     topk_ids: torch.Tensor,
-    expert_mask: Optional[torch.Tensor] = None,
+    expert_mask: torch.Tensor | None = None,
     activation_method: int = ActivationMethod.SILU.value,
     quant_method: int = QuantMethod.NO.value,
     doweight_stage1: bool = False,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
+    w1_scale: torch.Tensor | None = None,
+    w2_scale: torch.Tensor | None = None,
+    a1_scale: torch.Tensor | None = None,
+    a2_scale: torch.Tensor | None = None,
 ) -> torch.Tensor:
     return torch.empty_like(hidden_states)
 
@@ -300,7 +299,7 @@ def rocm_aiter_grouped_topk(
     topk_group: int = 0,
     scoring_func: str = "softmax",
     routed_scaling_factor: float = 1.0,
-    e_score_correction_bias: Optional[torch.Tensor] = None,
+    e_score_correction_bias: torch.Tensor | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     token = hidden_states.shape[0]
     device = hidden_states.device
@@ -342,8 +341,8 @@ def rocm_aiter_fused_experts(
     topk_ids: torch.Tensor,
     activation: str = "silu",
     apply_router_weight_on_input: bool = False,
-    expert_map: Optional[torch.Tensor] = None,
-    quant_config: Optional[FusedMoEQuantConfig] = None,
+    expert_map: torch.Tensor | None = None,
+    quant_config: FusedMoEQuantConfig | None = None,
 ) -> torch.Tensor:
     if quant_config is None:
         quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
diff --git a/vllm/model_executor/layers/fused_moe/routing_simulator.py b/vllm/model_executor/layers/fused_moe/routing_simulator.py
index af20f4b7c1d2b..8b04cf4539e04 100644
--- a/vllm/model_executor/layers/fused_moe/routing_simulator.py
+++ b/vllm/model_executor/layers/fused_moe/routing_simulator.py
@@ -10,7 +10,7 @@ like uniform random routing.
 """
 
 from abc import ABC, abstractmethod
-from typing import Any, Optional
+from typing import Any
 
 import torch
 
@@ -24,7 +24,7 @@ class RoutingStrategy(ABC):
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
         top_k: int,
-        indices_type: Optional[torch.dtype] = None,
+        indices_type: torch.dtype | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Route tokens to experts.
@@ -89,7 +89,7 @@ class DistributionBasedRouting(RoutingStrategy):
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
         top_k: int,
-        indices_type: Optional[torch.dtype] = None,
+        indices_type: torch.dtype | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Randomly select experts for each token using the specified distribution.
@@ -269,7 +269,7 @@ class RoutingSimulator:
         router_logits: torch.Tensor,
         strategy_name: str,
         top_k: int,
-        indices_type: Optional[torch.dtype] = None,
+        indices_type: torch.dtype | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Simulate token-to-expert routing using the specified strategy.
diff --git a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py
index a678fdae8833e..ecf11dd586a05 100644
--- a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
@@ -18,7 +17,7 @@ class SharedFusedMoE(FusedMoE):
 
     def __init__(
         self,
-        shared_experts: Optional[torch.nn.Module],
+        shared_experts: torch.nn.Module | None,
         use_overlapped: bool = True,
         **kwargs,
     ):
@@ -35,7 +34,7 @@ class SharedFusedMoE(FusedMoE):
         )
 
     @property
-    def shared_experts(self) -> Optional[torch.nn.Module]:
+    def shared_experts(self) -> torch.nn.Module | None:
         return self._shared_experts if self.use_overlapped else None
 
     def forward(
diff --git a/vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py b/vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py
index e725a0f00363e..99d4038ec3813 100644
--- a/vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py
+++ b/vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -29,7 +28,7 @@ class TopKWeightAndReduceDelegate(mk.TopKWeightAndReduce):
 
     def apply(
         self,
-        output: Optional[torch.Tensor],
+        output: torch.Tensor | None,
         fused_expert_output: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
@@ -52,7 +51,7 @@ class TopKWeightAndReduceNoOP(mk.TopKWeightAndReduce):
 
     def apply(
         self,
-        output: Optional[torch.Tensor],
+        output: torch.Tensor | None,
         fused_expert_output: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
@@ -84,7 +83,7 @@ class TopKWeightAndReduceContiguous(mk.TopKWeightAndReduce):
 
     def apply(
         self,
-        output: Optional[torch.Tensor],
+        output: torch.Tensor | None,
         fused_expert_output: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
@@ -133,7 +132,7 @@ class TopKWeightAndReduceNaiveBatched(mk.TopKWeightAndReduce):
 
     def apply(
         self,
-        output: Optional[torch.Tensor],
+        output: torch.Tensor | None,
         fused_expert_output: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
index 94a3ba74e47fd..908b1806acc0c 100644
--- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
@@ -89,7 +88,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         # Note: the deep gemm workspaces are strictly larger than the triton
         # workspaces so we can be pessimistic here and allocate for DeepGemm
@@ -128,12 +127,12 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
         use_deep_gemm = self.allow_deep_gemm and (
diff --git a/vllm/model_executor/layers/fused_moe/trtllm_moe.py b/vllm/model_executor/layers/fused_moe/trtllm_moe.py
index c84d1afeb1f97..0b0048c6455ec 100644
--- a/vllm/model_executor/layers/fused_moe/trtllm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/trtllm_moe.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
@@ -58,7 +57,7 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk: int,
         global_num_experts: int,
         local_num_experts: int,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         # The workspaces for this implementation are managed by flashinfer.
         workspace1 = (0,)
@@ -100,12 +99,12 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk_ids: torch.Tensor,
         activation: str,
         global_num_experts: int,
-        expert_map: Optional[torch.Tensor],
-        a1q_scale: Optional[torch.Tensor],
-        a2_scale: Optional[torch.Tensor],
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
-        expert_tokens_meta: Optional[mk.ExpertTokensMetadata],
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
         topk = topk_ids.size(-1)
diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py
index bd68d2ec884de..a682f848b0c4f 100644
--- a/vllm/model_executor/layers/fused_moe/utils.py
+++ b/vllm/model_executor/layers/fused_moe/utils.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from math import prod
-from typing import Optional, Union
 
 import torch
 
@@ -60,7 +59,7 @@ def _count_expert_num_tokens(
 
 
 def count_expert_num_tokens(
-    topk_ids: torch.Tensor, num_local_experts: int, expert_map: Optional[torch.Tensor]
+    topk_ids: torch.Tensor, num_local_experts: int, expert_map: torch.Tensor | None
 ) -> torch.Tensor:
     """
     Count the number to tokens assigned to each expert.
@@ -112,7 +111,7 @@ def _resize_cache(x: torch.Tensor, v: tuple[int, ...]) -> torch.Tensor:
 
 def _nvfp4_quantize(
     A: torch.Tensor,
-    A_scale: Optional[torch.Tensor],
+    A_scale: torch.Tensor | None,
     is_sf_swizzled_layout: bool,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     return flashinfer_fp4_quantize(
@@ -122,9 +121,9 @@ def _nvfp4_quantize(
 
 def _fp8_quantize(
     A: torch.Tensor,
-    A_scale: Optional[torch.Tensor],
+    A_scale: torch.Tensor | None,
     per_act_token: bool,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Perform fp8 quantization on the inputs.  If a block_shape
@@ -148,9 +147,9 @@ def _fp8_quantize(
 
 def _int8_quantize(
     A: torch.Tensor,
-    A_scale: Optional[torch.Tensor],
+    A_scale: torch.Tensor | None,
     per_act_token: bool,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Perform int8 quantization on the inputs.  If a block_shape
@@ -175,9 +174,9 @@ def _int8_quantize(
 
 def _mxfp4_quantize(
     A: torch.Tensor,
-    A_scale: Optional[torch.Tensor],
+    A_scale: torch.Tensor | None,
     per_act_token_quant: bool,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
 ) -> tuple[torch.Tensor, None]:
     assert block_shape is None
     # TODO: native mxfp4 is currently not integrated in vllm,
@@ -191,9 +190,9 @@ def _mxfp4_quantize(
 
 def _mxfp8_e4m3_quantize(
     A: torch.Tensor,
-    A_scale: Optional[torch.Tensor],
+    A_scale: torch.Tensor | None,
     per_act_token_quant: bool,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     assert A_scale is None
     assert not per_act_token_quant
@@ -203,9 +202,9 @@ def _mxfp8_e4m3_quantize(
 
 def _mxfp6_e3m2_quantize(
     A: torch.Tensor,
-    A_scale: Optional[torch.Tensor],
+    A_scale: torch.Tensor | None,
     per_act_token_quant: bool,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
 ) -> tuple[torch.Tensor, None]:
     assert block_shape is None
 
@@ -220,9 +219,9 @@ def _mxfp6_e3m2_quantize(
 
 def _mxfp6_e2m3_quantize(
     A: torch.Tensor,
-    A_scale: Optional[torch.Tensor],
+    A_scale: torch.Tensor | None,
     per_act_token_quant: bool,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
 ) -> tuple[torch.Tensor, None]:
     assert block_shape is None
 
@@ -237,12 +236,12 @@ def _mxfp6_e2m3_quantize(
 
 def moe_kernel_quantize_input(
     A: torch.Tensor,
-    A_scale: Optional[torch.Tensor],
-    quant_dtype: Union[None, torch.dtype, str],
+    A_scale: torch.Tensor | None,
+    quant_dtype: None | torch.dtype | str,
     per_act_token_quant: bool,
-    block_shape: Optional[list[int]] = None,
+    block_shape: list[int] | None = None,
     is_fp4_scale_swizzled: bool = True,
-) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+) -> tuple[torch.Tensor, torch.Tensor | None]:
     if quant_dtype == torch.float8_e4m3fn:
         return _fp8_quantize(A, A_scale, per_act_token_quant, block_shape)
     elif quant_dtype == torch.int8:
@@ -273,7 +272,7 @@ def _fp8_perm(m: torch.Tensor, idx: torch.Tensor) -> torch.Tensor:
         return m[idx, ...]
 
 
-def normalize_scales_shape(scales: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+def normalize_scales_shape(scales: torch.Tensor | None) -> torch.Tensor | None:
     if scales is not None:
         if scales.numel() == 1:
             scales = scales.view(1, 1)
@@ -283,9 +282,9 @@ def normalize_scales_shape(scales: Optional[torch.Tensor]) -> Optional[torch.Ten
 
 
 def normalize_batched_scales_shape(
-    scales: Optional[torch.Tensor],
+    scales: torch.Tensor | None,
     num_experts: int,
-) -> Optional[torch.Tensor]:
+) -> torch.Tensor | None:
     if scales is not None and scales.ndim < 3:
         if scales.numel() == 1:
             scales = scales.view(1)
@@ -300,9 +299,9 @@ def normalize_batched_scales_shape(
 
 def _validate_scale_shape(
     a: torch.Tensor,
-    a_scale: Optional[torch.Tensor],
+    a_scale: torch.Tensor | None,
     per_act_token_quant: bool,
-    block_shape: Optional[list[int]],
+    block_shape: list[int] | None,
 ) -> None:
     if a_scale is None:
         return
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index 910f145b1f8c2..135fbda2d540f 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -2,8 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Custom normalization layers."""
 
-from typing import Optional, Union
-
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -159,9 +157,9 @@ class RMSNorm(CustomOp):
         self,
         hidden_size: int,
         eps: float = 1e-6,
-        var_hidden_size: Optional[int] = None,
+        var_hidden_size: int | None = None,
         has_weight: bool = True,
-        dtype: Optional[torch.dtype] = None,
+        dtype: torch.dtype | None = None,
     ) -> None:
         super().__init__()
 
@@ -190,8 +188,8 @@ class RMSNorm(CustomOp):
     def forward_native(
         self,
         x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        residual: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         """PyTorch-native implementation equivalent to forward()."""
         orig_dtype = x.dtype
         x = x.to(torch.float32)
@@ -231,8 +229,8 @@ class RMSNorm(CustomOp):
     def forward_cuda(
         self,
         x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        residual: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if self.variance_size_override is not None:
             return self.forward_native(x, residual)
 
@@ -247,8 +245,8 @@ class RMSNorm(CustomOp):
     def forward_hip(
         self,
         x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        residual: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if self.variance_size_override is not None:
             return self.forward_native(x, residual)
 
@@ -263,8 +261,8 @@ class RMSNorm(CustomOp):
     def forward_xpu(
         self,
         x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        residual: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if self.variance_size_override is not None:
             return self.forward_native(x, residual)
 
@@ -313,8 +311,8 @@ class GemmaRMSNorm(CustomOp):
         weight: torch.Tensor,
         variance_epsilon: float,
         x: torch.Tensor,
-        residual: Optional[torch.Tensor],
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        residual: torch.Tensor | None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         """PyTorch-native implementation equivalent to forward()."""
         orig_dtype = x.dtype
         if residual is not None:
@@ -337,16 +335,16 @@ class GemmaRMSNorm(CustomOp):
     def forward_native(
         self,
         x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        residual: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         """PyTorch-native implementation equivalent to forward()."""
         return self.forward_static(self.weight.data, self.variance_epsilon, x, residual)
 
     def forward_cuda(
         self,
         x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        residual: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if torch.compiler.is_compiling():
             return self.forward_native(x, residual)
 
diff --git a/vllm/model_executor/layers/lightning_attn.py b/vllm/model_executor/layers/lightning_attn.py
index e874301b02c05..99853680eac6c 100644
--- a/vllm/model_executor/layers/lightning_attn.py
+++ b/vllm/model_executor/layers/lightning_attn.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 from einops import rearrange
@@ -529,7 +528,7 @@ def lightning_attention(
     v: torch.Tensor,
     ed: torch.Tensor,
     block_size: int = 256,
-    kv_history: Optional[torch.Tensor] = None,
+    kv_history: torch.Tensor | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Apply lightning attention algorithm
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 63358a0c07d89..34bfcabc69a55 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -3,7 +3,7 @@
 
 import itertools
 from abc import abstractmethod
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch.nn.parameter import Parameter, UninitializedParameter
@@ -187,7 +187,7 @@ class LinearMethodBase(QuantizeMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Apply the weights in layer to the input tensor.
         Expects create_weights to have been called before on the layer."""
@@ -252,7 +252,7 @@ class UnquantizedLinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return dispatch_unquantized_gemm()(layer, x, layer.weight, bias)
 
@@ -276,8 +276,8 @@ class LinearBase(CustomOp):
         input_size: int,
         output_size: int,
         skip_bias_add: bool = False,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         *,
         return_bias: bool = True,
@@ -295,7 +295,7 @@ class LinearBase(CustomOp):
         self.quant_config = quant_config
         self.prefix = prefix
         if quant_config is None:
-            self.quant_method: Optional[QuantizeMethodBase] = UnquantizedLinearMethod()
+            self.quant_method: QuantizeMethodBase | None = UnquantizedLinearMethod()
         else:
             self.quant_method = quant_config.get_quant_method(self, prefix=prefix)
         self.return_bias = return_bias
@@ -333,8 +333,8 @@ class ReplicatedLinear(LinearBase):
         output_size: int,
         bias: bool = True,
         skip_bias_add: bool = False,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         *,
         return_bias: bool = True,
@@ -409,7 +409,7 @@ class ReplicatedLinear(LinearBase):
     def forward(
         self,
         x: torch.Tensor,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, Parameter | None]:
         bias = self.bias if not self.skip_bias_add else None
         assert self.quant_method is not None
 
@@ -461,9 +461,9 @@ class ColumnParallelLinear(LinearBase):
         bias: bool = True,
         gather_output: bool = False,
         skip_bias_add: bool = False,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        output_sizes: Optional[list[int]] = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
+        output_sizes: list[int] | None = None,
         prefix: str = "",
         *,
         return_bias: bool = True,
@@ -574,7 +574,7 @@ class ColumnParallelLinear(LinearBase):
     def forward(
         self,
         input_,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, Parameter | None]:
         bias = self.bias if not self.skip_bias_add else None
 
         # Matrix multiply.
@@ -633,8 +633,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
         bias: bool = True,
         gather_output: bool = False,
         skip_bias_add: bool = False,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         *,
         return_bias: bool = True,
@@ -662,7 +662,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
         self,
         param: Parameter,
         loaded_weight: torch.Tensor,
-        loaded_shard_id: Optional[int] = None,
+        loaded_shard_id: int | None = None,
     ):
         # Special case for GGUF
         # initialize GGUF param after we know the quantize type
@@ -838,7 +838,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
         self,
         param: BasevLLMParameter,
         loaded_weight: torch.Tensor,
-        loaded_shard_id: Optional[int] = None,
+        loaded_shard_id: int | None = None,
     ):
         if loaded_shard_id is None:
             if isinstance(param, PerTensorScaleParameter):
@@ -914,11 +914,11 @@ class QKVParallelLinear(ColumnParallelLinear):
         hidden_size: int,
         head_size: int,
         total_num_heads: int,
-        total_num_kv_heads: Optional[int] = None,
+        total_num_kv_heads: int | None = None,
         bias: bool = True,
         skip_bias_add: bool = False,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         *,
         return_bias: bool = True,
@@ -1027,7 +1027,7 @@ class QKVParallelLinear(ColumnParallelLinear):
         self,
         param: BasevLLMParameter,
         loaded_weight: torch.Tensor,
-        loaded_shard_id: Optional[str] = None,
+        loaded_shard_id: str | None = None,
     ):
         if loaded_shard_id is None:  # special case for certain models
             if isinstance(param, PerTensorScaleParameter):
@@ -1071,7 +1071,7 @@ class QKVParallelLinear(ColumnParallelLinear):
         self,
         param: Parameter,
         loaded_weight: torch.Tensor,
-        loaded_shard_id: Optional[str] = None,
+        loaded_shard_id: str | None = None,
     ):
         # Special case for GGUF
         # initialize GGUF param after we know the quantize type
@@ -1296,9 +1296,9 @@ class RowParallelLinear(LinearBase):
         bias: bool = True,
         input_is_parallel: bool = True,
         skip_bias_add: bool = False,
-        params_dtype: Optional[torch.dtype] = None,
+        params_dtype: torch.dtype | None = None,
         reduce_results: bool = True,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         *,
         return_bias: bool = True,
@@ -1405,7 +1405,7 @@ class RowParallelLinear(LinearBase):
     def forward(
         self,
         input_,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, Parameter | None]:
         if self.input_is_parallel:
             input_parallel = input_
         else:
diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py
index 3db5e0b325538..c8d57f597d1ca 100644
--- a/vllm/model_executor/layers/logits_processor.py
+++ b/vllm/model_executor/layers/logits_processor.py
@@ -2,8 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A layer that compute logits from hidden_stats."""
 
-from typing import Optional
-
 import torch
 
 from vllm.distributed import (
@@ -28,10 +26,10 @@ class LogitsProcessor(CustomOp):
     def __init__(
         self,
         vocab_size: int,
-        org_vocab_size: Optional[int] = None,
+        org_vocab_size: int | None = None,
         scale: float = 1.0,
         logits_as_input: bool = False,
-        soft_cap: Optional[float] = None,
+        soft_cap: float | None = None,
     ) -> None:
         """
         Args:
@@ -53,8 +51,8 @@ class LogitsProcessor(CustomOp):
         self,
         lm_head: VocabParallelEmbedding,
         hidden_states: torch.Tensor,
-        embedding_bias: Optional[torch.Tensor] = None,
-    ) -> Optional[torch.Tensor]:
+        embedding_bias: torch.Tensor | None = None,
+    ) -> torch.Tensor | None:
         if self.logits_as_input:
             logits = hidden_states
         else:
@@ -88,8 +86,8 @@ class LogitsProcessor(CustomOp):
         self,
         hidden_states: torch.Tensor,
         lm_head: VocabParallelEmbedding,
-        embedding_bias: Optional[torch.Tensor],
-    ) -> Optional[torch.Tensor]:
+        embedding_bias: torch.Tensor | None,
+    ) -> torch.Tensor | None:
         # Get the logits for the next tokens.
         logits = lm_head.quant_method.apply(lm_head, hidden_states, bias=embedding_bias)
 
diff --git a/vllm/model_executor/layers/mamba/linear_attn.py b/vllm/model_executor/layers/mamba/linear_attn.py
index 99f05e2eca0e8..b5a37b2582e56 100644
--- a/vllm/model_executor/layers/mamba/linear_attn.py
+++ b/vllm/model_executor/layers/mamba/linear_attn.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend
@@ -87,8 +87,8 @@ class MiniMaxText01RMSNormTP(CustomOp):
     def forward(
         self,
         x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        residual: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert residual is None, "RMSNorm does not support residual connection."
         return self._forward(x)
 
@@ -102,7 +102,7 @@ class MiniMaxText01LinearKernel:
         kv_caches: torch.Tensor,
         slope_rate: torch.Tensor,
         block_size: int,
-        layer_idx: Optional[int] = None,
+        layer_idx: int | None = None,
         **kwargs,
     ) -> torch.Tensor:
         slope_rate = slope_rate.to(torch.float32)
@@ -154,9 +154,9 @@ class MiniMaxText01LinearAttention(nn.Module, MambaBase):
         max_position: int,
         block_size: int,
         num_hidden_layer: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         layer_idx: int = 0,
         linear_layer_idx: int = 0,
         prefix: str = "linear_attn",
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py
index 8ab77965ae80a..8f7317556f776 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING, NamedTuple, Optional
+from typing import TYPE_CHECKING, NamedTuple
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend
@@ -68,8 +68,8 @@ class MambaMixer(MambaBase, CustomOp):
         rms_norm_eps: float = 1e-5,
         activation="silu",
         is_lora_enabled: bool = False,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -410,7 +410,7 @@ class MambaMixer(MambaBase, CustomOp):
 
         return Mamba1AttentionBackend
 
-    def _time_proj_bias(self) -> Optional[torch.Tensor]:
+    def _time_proj_bias(self) -> torch.Tensor | None:
         if hasattr(self.dt_proj, "bias") and self.dt_proj.bias is not None:
             return self.dt_proj.bias.float()
         return None
@@ -423,8 +423,8 @@ class PrefillDecodeSplit(NamedTuple):
     gate_d: torch.Tensor
     state_indices_tensor_p: torch.Tensor
     state_indices_tensor_d: torch.Tensor
-    query_start_loc_p: Optional[torch.Tensor]
-    has_initial_states_p: Optional[torch.Tensor]
+    query_start_loc_p: torch.Tensor | None
+    has_initial_states_p: torch.Tensor | None
 
 
 def split_batch_to_prefill_and_decode(
@@ -432,7 +432,7 @@ def split_batch_to_prefill_and_decode(
     gate: torch.Tensor,
     state_indices_tensor: torch.Tensor,
     query_start_loc: torch.Tensor,
-    has_initial_states: Optional[torch.Tensor],
+    has_initial_states: torch.Tensor | None,
     num_prefill_tokens: int,
     num_decode_tokens: int,
     num_prefills: int,
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py
index 7589905ac9277..b0ee327a82347 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer2.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend
@@ -138,7 +138,7 @@ class Mixer2RMSNormGated(CustomOp):
         self,
         x: torch.Tensor,
         gate: torch.Tensor,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         input_dtype = x.dtype
         if not self.use_rms_norm:
             # Keep gate in float32 for numerical stability during silu
@@ -244,9 +244,9 @@ class MambaMixer2(MambaBase, CustomOp):
         rms_norm_eps: float = 1e-5,
         activation: str = "silu",
         use_rms_norm: bool = True,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -474,7 +474,7 @@ class MambaMixer2(MambaBase, CustomOp):
         self,
         hidden_states: torch.Tensor,
         output: torch.Tensor,
-        mup_vector: Optional[torch.Tensor] = None,
+        mup_vector: torch.Tensor | None = None,
     ):
         pass
 
@@ -482,7 +482,7 @@ class MambaMixer2(MambaBase, CustomOp):
         self,
         hidden_states: torch.Tensor,
         output: torch.Tensor,
-        mup_vector: Optional[torch.Tensor] = None,
+        mup_vector: torch.Tensor | None = None,
     ):
         torch.ops.vllm.mamba_mixer2(
             hidden_states,
@@ -495,7 +495,7 @@ class MambaMixer2(MambaBase, CustomOp):
         self,
         hidden_states: torch.Tensor,
         output: torch.Tensor,
-        mup_vector: Optional[torch.Tensor] = None,
+        mup_vector: torch.Tensor | None = None,
     ):
         forward_context = get_forward_context()
         # attn_metadata contains metadata necessary for the mamba2 triton
@@ -904,7 +904,7 @@ def mamba_mixer2(
     hidden_states: torch.Tensor,
     output: torch.Tensor,
     layer_name: str,
-    mup_vector: Optional[torch.Tensor] = None,
+    mup_vector: torch.Tensor | None = None,
 ) -> None:
     forward_context: ForwardContext = get_forward_context()
     self = forward_context.no_compile_layers[layer_name]
@@ -915,7 +915,7 @@ def mamba_mixer2_fake(
     hidden_states: torch.Tensor,
     output: torch.Tensor,
     layer_name: str,
-    mup_vector: Optional[torch.Tensor] = None,
+    mup_vector: torch.Tensor | None = None,
 ) -> None:
     return
 
diff --git a/vllm/model_executor/layers/mamba/mamba_utils.py b/vllm/model_executor/layers/mamba/mamba_utils.py
index 0f160b2c924fb..41ab7f3fecdbc 100644
--- a/vllm/model_executor/layers/mamba/mamba_utils.py
+++ b/vllm/model_executor/layers/mamba/mamba_utils.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Union
 
 import torch
 
@@ -14,7 +13,7 @@ class MambaStateDtypeCalculator:
     @classmethod
     def linear_attention_state_dtype(
         cls,
-        model_dtype: Union[ModelDType, torch.dtype],
+        model_dtype: ModelDType | torch.dtype,
         mamba_cache_dtype: MambaDType,
     ) -> tuple[torch.dtype, ...]:
         # TODO (tdoublep) requires testing
@@ -26,7 +25,7 @@ class MambaStateDtypeCalculator:
     @classmethod
     def mamba1_state_dtype(
         cls,
-        model_dtype: Union[ModelDType, torch.dtype],
+        model_dtype: ModelDType | torch.dtype,
         mamba_cache_dtype: MambaDType,
         mamba_ssm_cache_dtype: MambaDType,
     ) -> tuple[torch.dtype, ...]:
@@ -37,7 +36,7 @@ class MambaStateDtypeCalculator:
     @classmethod
     def mamba2_state_dtype(
         cls,
-        model_dtype: Union[ModelDType, torch.dtype],
+        model_dtype: ModelDType | torch.dtype,
         mamba_cache_dtype: MambaDType,
         mamba_ssm_cache_dtype: MambaDType,
     ) -> tuple[torch.dtype, ...]:
@@ -48,7 +47,7 @@ class MambaStateDtypeCalculator:
     @classmethod
     def _mamba_state_dtype(
         cls,
-        model_dtype: Union[ModelDType, torch.dtype],
+        model_dtype: ModelDType | torch.dtype,
         mamba_cache_dtype: MambaDType,
         mamba_ssm_cache_dtype: MambaDType,
     ) -> tuple[torch.dtype, ...]:
@@ -63,7 +62,7 @@ class MambaStateDtypeCalculator:
     @classmethod
     def short_conv_state_dtype(
         cls,
-        model_dtype: Union[ModelDType, torch.dtype],
+        model_dtype: ModelDType | torch.dtype,
         mamba_cache_dtype: MambaDType,
     ) -> tuple[torch.dtype, ...]:
         conv_state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype)
@@ -72,7 +71,7 @@ class MambaStateDtypeCalculator:
     @classmethod
     def gated_delta_net_state_dtype(
         cls,
-        model_dtype: Union[ModelDType, torch.dtype],
+        model_dtype: ModelDType | torch.dtype,
         mamba_cache_dtype: MambaDType,
     ) -> tuple[torch.dtype, torch.dtype]:
         state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype)
diff --git a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
index ec486d3b92678..83c2c5f11e187 100644
--- a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
+++ b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
@@ -4,7 +4,6 @@
 # Copyright (c) 2024, Tri Dao.
 # Adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/causal_conv1d/causal_conv1d_interface.py
 
-from typing import Optional, Union
 
 import numpy as np
 import torch
@@ -469,17 +468,17 @@ def _causal_conv1d_fwd_kernel(  # continuous batching
 def causal_conv1d_fn(
     x: torch.Tensor,
     weight: torch.Tensor,
-    bias: Union[torch.Tensor, None],
+    bias: torch.Tensor | None,
     conv_states: torch.Tensor,
     query_start_loc: torch.Tensor,
-    cache_indices: Optional[torch.Tensor] = None,
-    has_initial_state: Optional[torch.Tensor] = None,
-    activation: Optional[str] = "silu",
+    cache_indices: torch.Tensor | None = None,
+    has_initial_state: torch.Tensor | None = None,
+    activation: str | None = "silu",
     pad_slot_id: int = PAD_SLOT_ID,
-    block_idx_first_scheduled_token: Optional[torch.Tensor] = None,
-    block_idx_last_scheduled_token: Optional[torch.Tensor] = None,
-    initial_state_idx: Optional[torch.Tensor] = None,
-    num_computed_tokens: Optional[torch.Tensor] = None,
+    block_idx_first_scheduled_token: torch.Tensor | None = None,
+    block_idx_last_scheduled_token: torch.Tensor | None = None,
+    initial_state_idx: torch.Tensor | None = None,
+    num_computed_tokens: torch.Tensor | None = None,
     block_size_to_align=0,
     metadata=None,
     validate_data=False,
@@ -1071,15 +1070,15 @@ def causal_conv1d_update(
     x: torch.Tensor,
     conv_state: torch.Tensor,
     weight: torch.Tensor,
-    bias: Optional[torch.Tensor] = None,
-    activation: Union[bool, str, None] = None,
-    conv_state_indices: Optional[torch.Tensor] = None,
-    num_accepted_tokens: Optional[torch.Tensor] = None,
-    query_start_loc: Optional[torch.Tensor] = None,
+    bias: torch.Tensor | None = None,
+    activation: bool | str | None = None,
+    conv_state_indices: torch.Tensor | None = None,
+    num_accepted_tokens: torch.Tensor | None = None,
+    query_start_loc: torch.Tensor | None = None,
     max_query_len: int = -1,
     pad_slot_id: int = PAD_SLOT_ID,
-    block_idx_last_scheduled_token: Optional[torch.Tensor] = None,
-    initial_state_idx: Optional[torch.Tensor] = None,
+    block_idx_last_scheduled_token: torch.Tensor | None = None,
+    initial_state_idx: torch.Tensor | None = None,
     validate_data=False,
 ):
     """
diff --git a/vllm/model_executor/layers/mamba/short_conv.py b/vllm/model_executor/layers/mamba/short_conv.py
index 32273d137eca2..afaa706929a2c 100644
--- a/vllm/model_executor/layers/mamba/short_conv.py
+++ b/vllm/model_executor/layers/mamba/short_conv.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend
@@ -38,8 +38,8 @@ class ShortConv(MambaBase, CustomOp):
         config,
         dim: int,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
diff --git a/vllm/model_executor/layers/mla.py b/vllm/model_executor/layers/mla.py
index 4b397a058dcd8..4c81162d7d2b9 100644
--- a/vllm/model_executor/layers/mla.py
+++ b/vllm/model_executor/layers/mla.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 
@@ -19,14 +18,14 @@ class MLAModules:
     kv_b_proj: torch.nn.Module
     rotary_emb: torch.nn.Module
     o_proj: torch.nn.Module
-    fused_qkv_a_proj: Optional[torch.nn.Module]
-    kv_a_proj_with_mqa: Optional[torch.nn.Module]
-    q_a_layernorm: Optional[torch.nn.Module]
-    q_b_proj: Optional[torch.nn.Module]
-    q_proj: Optional[torch.nn.Module]
-    indexer: Optional[torch.nn.Module]
+    fused_qkv_a_proj: torch.nn.Module | None
+    kv_a_proj_with_mqa: torch.nn.Module | None
+    q_a_layernorm: torch.nn.Module | None
+    q_b_proj: torch.nn.Module | None
+    q_proj: torch.nn.Module | None
+    indexer: torch.nn.Module | None
     is_sparse: bool
-    topk_indices_buffer: Optional[torch.Tensor]
+    topk_indices_buffer: torch.Tensor | None
 
 
 @CustomOp.register("multi_head_latent_attention")
@@ -55,11 +54,11 @@ class MultiHeadLatentAttentionWrapper(CustomOp):
         qk_nope_head_dim: int,
         qk_rope_head_dim: int,
         v_head_dim: int,
-        q_lora_rank: Optional[int],
+        q_lora_rank: int | None,
         kv_lora_rank: int,
         mla_modules: MLAModules,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py
index 979939ebc4686..010c607bcabf7 100644
--- a/vllm/model_executor/layers/pooler.py
+++ b/vllm/model_executor/layers/pooler.py
@@ -1,11 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
-from collections.abc import Mapping, Set
+from collections.abc import Callable, Mapping, Set
 from dataclasses import dataclass
 from enum import IntEnum
 from itertools import groupby
-from typing import Callable, Optional, TypeVar, Union
+from typing import TypeVar
 
 import torch
 import torch.nn as nn
@@ -24,8 +24,8 @@ from vllm.v1.pool.metadata import PoolingCursor, PoolingMetadata
 logger = init_logger(__name__)
 
 PoolingFn = Callable[
-    [Union[torch.Tensor, list[torch.Tensor]], PoolingMetadata],
-    Union[torch.Tensor, list[torch.Tensor]],
+    [torch.Tensor | list[torch.Tensor], PoolingMetadata],
+    torch.Tensor | list[torch.Tensor],
 ]
 ClassifierFn = Callable[[torch.Tensor], torch.Tensor]
 
@@ -90,7 +90,7 @@ class Pooler(nn.Module, ABC):
     @staticmethod
     def for_classify(
         pooler_config: PoolerConfig,
-        classifier: Optional[ClassifierFn],
+        classifier: ClassifierFn | None,
     ):
         resolved_config = ResolvedPoolingConfig.from_config(
             task="classify",
@@ -118,14 +118,14 @@ class Pooler(nn.Module, ABC):
     @abstractmethod
     def forward(
         self,
-        hidden_states: Union[list[torch.Tensor], torch.Tensor],
+        hidden_states: list[torch.Tensor] | torch.Tensor,
         pooling_metadata: PoolingMetadata,
     ) -> PoolerOutput:
         raise NotImplementedError
 
 
 def get_prompt_lens(
-    hidden_states: Union[torch.Tensor, list[torch.Tensor]],
+    hidden_states: torch.Tensor | list[torch.Tensor],
     pooling_metadata: PoolingMetadata,
 ) -> torch.Tensor:
     return pooling_metadata.prompt_lens
@@ -174,7 +174,7 @@ def get_classification_activation_function(config: PretrainedConfig):
 
 
 def get_cross_encoder_activation_function(config: PretrainedConfig):
-    function_name: Optional[str] = None
+    function_name: str | None = None
     if (
         hasattr(config, "sentence_transformers")
         and "activation_fn" in config.sentence_transformers
@@ -223,14 +223,14 @@ class PoolingMethod(nn.Module, ABC):
         self,
         hidden_states: torch.Tensor,
         pooling_cursor: PoolingCursor,
-    ) -> Union[list[torch.Tensor], torch.Tensor]:
+    ) -> list[torch.Tensor] | torch.Tensor:
         raise NotImplementedError
 
     def forward(
         self,
         hidden_states: torch.Tensor,
         pooling_metadata: PoolingMetadata,
-    ) -> Union[list[torch.Tensor], torch.Tensor]:
+    ) -> list[torch.Tensor] | torch.Tensor:
         pooling_cursor = pooling_metadata.pooling_cursor
         return self.forward_all(hidden_states, pooling_cursor)
 
@@ -243,7 +243,7 @@ class CLSPool(PoolingMethod):
         self,
         hidden_states: torch.Tensor,
         pooling_cursor: PoolingCursor,
-    ) -> Union[list[torch.Tensor], torch.Tensor]:
+    ) -> list[torch.Tensor] | torch.Tensor:
         assert not pooling_cursor.is_partial_prefill(), (
             "partial prefill not supported with CLS pooling"
         )
@@ -259,7 +259,7 @@ class LastPool(PoolingMethod):
         self,
         hidden_states: torch.Tensor,
         pooling_cursor: PoolingCursor,
-    ) -> Union[list[torch.Tensor], torch.Tensor]:
+    ) -> list[torch.Tensor] | torch.Tensor:
         return hidden_states[pooling_cursor.last_token_indices_gpu]
 
 
@@ -271,7 +271,7 @@ class AllPool(PoolingMethod):
         self,
         hidden_states: torch.Tensor,
         pooling_cursor: PoolingCursor,
-    ) -> Union[list[torch.Tensor], torch.Tensor]:
+    ) -> list[torch.Tensor] | torch.Tensor:
         assert not pooling_cursor.is_partial_prefill(), (
             "partial prefill not supported with ALL pooling"
         )
@@ -290,7 +290,7 @@ class MeanPool(PoolingMethod):
         self,
         hidden_states: torch.Tensor,
         pooling_cursor: PoolingCursor,
-    ) -> Union[list[torch.Tensor], torch.Tensor]:
+    ) -> list[torch.Tensor] | torch.Tensor:
         assert not pooling_cursor.is_partial_prefill(), (
             "partial prefill not supported with MEAN pooling"
         )
@@ -405,7 +405,7 @@ class PoolerHead(nn.Module):
 
     def forward(
         self,
-        pooled_data: Union[list[torch.Tensor], torch.Tensor],
+        pooled_data: list[torch.Tensor] | torch.Tensor,
         pooling_metadata: PoolingMetadata,
     ):
         return self.activation(pooled_data)
@@ -418,14 +418,14 @@ class EmbeddingPoolerHead(PoolerHead):
         # Load ST projector if available
 
         vllm_config = get_current_vllm_config()
-        self.projector: Optional[nn.Module] = (
+        self.projector: nn.Module | None = (
             _load_st_projector(vllm_config.model_config) if vllm_config else None
         )
         self.head_dtype = vllm_config.model_config.head_dtype
 
     def forward(
         self,
-        pooled_data: Union[list[torch.Tensor], torch.Tensor],
+        pooled_data: list[torch.Tensor] | torch.Tensor,
         pooling_metadata: PoolingMetadata,
     ):
         if isinstance(pooled_data, list):
@@ -480,7 +480,7 @@ class RewardPoolerHead(PoolerHead):
 
     def forward(
         self,
-        pooled_data: Union[list[torch.Tensor], torch.Tensor],
+        pooled_data: list[torch.Tensor] | torch.Tensor,
         pooling_metadata: PoolingMetadata,
     ):
         if isinstance(pooled_data, list):
@@ -541,7 +541,7 @@ class SimplePooler(Pooler):
 
     def forward(
         self,
-        hidden_states: Union[torch.Tensor, list[torch.Tensor]],
+        hidden_states: torch.Tensor | list[torch.Tensor],
         pooling_metadata: PoolingMetadata,
     ) -> PoolerOutput:
         pooled_data = self.pooling(hidden_states, pooling_metadata)
@@ -560,9 +560,9 @@ class StepPooler(Pooler):
 
     def extract_states(
         self,
-        hidden_states: Union[torch.Tensor, list[torch.Tensor]],
+        hidden_states: torch.Tensor | list[torch.Tensor],
         pooling_metadata: PoolingMetadata,
-    ) -> Union[list[torch.Tensor], torch.Tensor]:
+    ) -> list[torch.Tensor] | torch.Tensor:
         pooled_data_lst = self.pooling(hidden_states, pooling_metadata)
         prompt_token_ids = get_prompt_token_ids(pooling_metadata)
 
@@ -593,7 +593,7 @@ class StepPooler(Pooler):
 
     def forward(
         self,
-        hidden_states: Union[torch.Tensor, list[torch.Tensor]],
+        hidden_states: torch.Tensor | list[torch.Tensor],
         pooling_metadata: PoolingMetadata,
     ) -> PoolerOutput:
         pooled_data = self.extract_states(hidden_states, pooling_metadata)
@@ -621,8 +621,8 @@ class ClassifierPooler(Pooler):
     def __init__(
         self,
         pooling: PoolingFn,
-        classifier: Optional[ClassifierFn],
-        act_fn: Optional[PoolerActivation] = None,
+        classifier: ClassifierFn | None,
+        act_fn: PoolerActivation | None = None,
     ) -> None:
         super().__init__()
 
@@ -631,7 +631,7 @@ class ClassifierPooler(Pooler):
         self.pooling = pooling
         self.classifier = classifier
         self.act_fn = act_fn or PoolerClassify()
-        self.logit_bias: Optional[float] = (
+        self.logit_bias: float | None = (
             vllm_config.model_config.pooler_config.logit_bias
         )
         self.head_dtype = vllm_config.model_config.head_dtype
@@ -641,7 +641,7 @@ class ClassifierPooler(Pooler):
 
     def forward(
         self,
-        hidden_states: Union[torch.Tensor, list[torch.Tensor]],
+        hidden_states: torch.Tensor | list[torch.Tensor],
         pooling_metadata: PoolingMetadata,
     ) -> PoolerOutput:
         pooled_data = self.pooling(hidden_states, pooling_metadata)
@@ -695,7 +695,7 @@ class DispatchPooler(Pooler):
 
     def forward(
         self,
-        hidden_states: Union[torch.Tensor, list[torch.Tensor]],
+        hidden_states: torch.Tensor | list[torch.Tensor],
         pooling_metadata: PoolingMetadata,
     ) -> PoolerOutput:
         poolers_by_task = self.poolers_by_task
diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py
index b7ebc6f272db5..2889bc92dccb9 100644
--- a/vllm/model_executor/layers/quantization/auto_round.py
+++ b/vllm/model_executor/layers/quantization/auto_round.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from fractions import Fraction
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any
 
 import torch
 
@@ -46,8 +46,8 @@ class AutoRoundConfig(QuantizationConfig):
         group_size: int,
         sym: bool = True,
         packing_format: str = "auto_round:auto_gptq",
-        block_name_to_quantize: Optional[Union[str, list[str]]] = None,
-        extra_config: Optional[dict[str, Any]] = None,
+        block_name_to_quantize: str | list[str] | None = None,
+        extra_config: dict[str, Any] | None = None,
         data_type: str = "int",
         backend: str = "auto",
     ) -> None:
diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py
index d4f667564848c..551a4e7cebc5d 100644
--- a/vllm/model_executor/layers/quantization/awq.py
+++ b/vllm/model_executor/layers/quantization/awq.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Optional, Union
+from typing import Any, Union
 
 import torch
 
@@ -34,7 +34,7 @@ class AWQConfig(QuantizationConfig):
         weight_bits: int,
         group_size: int,
         zero_point: bool,
-        modules_to_not_convert: Optional[list[str]] = None,
+        modules_to_not_convert: list[str] | None = None,
     ) -> None:
         super().__init__()
         self.weight_bits = weight_bits
@@ -88,7 +88,7 @@ class AWQConfig(QuantizationConfig):
 
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional[Union["LinearMethodBase", "QuantizeMethodBase"]]:
+    ) -> Union["LinearMethodBase", "QuantizeMethodBase"] | None:
         if isinstance(layer, LinearBase):
             if is_layer_skipped_awq(prefix, self.modules_to_not_convert):
                 return UnquantizedLinearMethod()
@@ -227,7 +227,7 @@ class AWQLinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         qweight = layer.qweight
         scales = layer.scales
diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index 5d142387d4d9e..e1633d392dbf6 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable
+from typing import Any, Optional
 
 import torch
 from torch.nn import Parameter
@@ -70,7 +71,7 @@ class AWQMarlinConfig(QuantizationConfig):
         group_size: int,
         zero_point: bool,
         lm_head_quantized: bool,
-        modules_to_not_convert: Optional[list[str]],
+        modules_to_not_convert: list[str] | None,
         full_config: dict[str, Any],
     ) -> None:
         super().__init__()
@@ -140,7 +141,7 @@ class AWQMarlinConfig(QuantizationConfig):
     @classmethod
     def override_quantization_method(
         cls, hf_quant_cfg, user_quant
-    ) -> Optional[QuantizationMethods]:
+    ) -> QuantizationMethods | None:
         can_convert = cls.is_awq_marlin_compatible(hf_quant_cfg)
         is_valid_user_quant = (
             user_quant is None or user_quant == "marlin" or user_quant == "awq_marlin"
@@ -360,7 +361,7 @@ class AWQMarlinLinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return apply_awq_marlin_linear(
             input=x,
@@ -555,7 +556,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
-    ) -> Optional[FusedMoEQuantConfig]:
+    ) -> FusedMoEQuantConfig | None:
         return None
 
     def apply(
@@ -566,21 +567,21 @@ class AWQMoEMethod(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert self.fused_experts is None
 
         if enable_eplb:
diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py
index 26f5e8bb6c7df..c8a8424eb5c88 100644
--- a/vllm/model_executor/layers/quantization/base_config.py
+++ b/vllm/model_executor/layers/quantization/base_config.py
@@ -3,7 +3,7 @@
 
 import inspect
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import torch
 from torch import nn
@@ -105,7 +105,7 @@ class QuantizationConfig(ABC):
     @classmethod
     def override_quantization_method(
         cls, hf_quant_cfg, user_quant
-    ) -> Optional[QuantizationMethods]:
+    ) -> QuantizationMethods | None:
         """
         Detects if this quantization method can support a given checkpoint
         format by overriding the user specified quantization method --
@@ -135,7 +135,7 @@ class QuantizationConfig(ABC):
     @abstractmethod
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional[QuantizeMethodBase]:
+    ) -> QuantizeMethodBase | None:
         """Get the quantize method to use for the quantized layer.
 
         Args:
@@ -147,7 +147,7 @@ class QuantizationConfig(ABC):
         """
         raise NotImplementedError
 
-    def get_cache_scale(self, name: str) -> Optional[str]:
+    def get_cache_scale(self, name: str) -> str | None:
         return None
 
     def apply_vllm_mapper(  # noqa: B027
diff --git a/vllm/model_executor/layers/quantization/bitblas.py b/vllm/model_executor/layers/quantization/bitblas.py
index d2e0582be197e..be15f20cac21e 100644
--- a/vllm/model_executor/layers/quantization/bitblas.py
+++ b/vllm/model_executor/layers/quantization/bitblas.py
@@ -45,10 +45,10 @@ class BitBLASConfig(QuantizationConfig):
     def __init__(
         self,
         weight_bits: int,
-        group_size: Optional[int],
-        desc_act: Optional[bool],
-        is_sym: Optional[bool],
-        quant_method: Optional[str],
+        group_size: int | None,
+        desc_act: bool | None,
+        is_sym: bool | None,
+        quant_method: str | None,
         lm_head_quantized: bool,
     ) -> None:
         try:
@@ -160,7 +160,7 @@ class BitBLASConfig(QuantizationConfig):
     @classmethod
     def override_quantization_method(
         cls, hf_quant_cfg, user_quant
-    ) -> Optional[QuantizationMethods]:
+    ) -> QuantizationMethods | None:
         # compat: autogptq >=0.8.0 use checkpoint_format: str
         # compat: autogptq <=0.7.1 is_bitblas_format: bool
         is_bitblas_format = hf_quant_cfg.get(
@@ -469,7 +469,7 @@ class BitBLASLinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         qweight = layer.qweight
         scales = layer.scales
diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
index 80ed121bd85b8..81cf86a7d0eeb 100644
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable
+from typing import Any, Union
 
 import torch
 from packaging import version
@@ -41,7 +42,7 @@ class BitsAndBytesConfig(QuantizationConfig):
         bnb_4bit_use_double_quant: bool = False,
         llm_int8_enable_fp32_cpu_offload: bool = False,
         llm_int8_has_fp16_weight: bool = False,
-        llm_int8_skip_modules: Optional[list[str]] = None,
+        llm_int8_skip_modules: list[str] | None = None,
         llm_int8_threshold: float = 6.0,
     ) -> None:
         super().__init__()
@@ -138,7 +139,7 @@ class BitsAndBytesConfig(QuantizationConfig):
 
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional[Union["LinearMethodBase", "BitsAndBytesMoEMethod"]]:
+    ) -> Union["LinearMethodBase", "BitsAndBytesMoEMethod"] | None:
         if isinstance(layer, LinearBase):
             if is_layer_skipped_bnb(prefix, self.llm_int8_skip_modules):
                 return UnquantizedLinearMethod()
@@ -268,7 +269,7 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if self.quant_config.load_in_8bit:
             return self._apply_8bit_weight(layer, x, bias)
@@ -279,7 +280,7 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         # only load the bitsandbytes module when needed
         from bitsandbytes import MatmulLtState, matmul
@@ -359,7 +360,7 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         original_type = x.dtype
         original_shape = x.shape
@@ -489,7 +490,7 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
-    ) -> Optional[FusedMoEQuantConfig]:
+    ) -> FusedMoEQuantConfig | None:
         return None
 
     def apply(
@@ -500,21 +501,21 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         from vllm.model_executor.layers.fused_moe import fused_experts
 
         assert self.fused_experts is None
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index e89d002078ac1..15a0ff23273dd 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -71,7 +71,7 @@ logger = init_logger(__name__)
 __all__ = ["CompressedTensorsLinearMethod"]
 
 SPARSITY_CONFIG_NAME: Literal["sparsity_config"] = "sparsity_config"
-QUANTIZATION_SCHEME_MAP_TYPE = dict[str, Optional[dict[str, QuantizationArgs]]]
+QUANTIZATION_SCHEME_MAP_TYPE = dict[str, dict[str, QuantizationArgs] | None]
 
 
 class CompressedTensorsConfig(QuantizationConfig):
@@ -82,9 +82,9 @@ class CompressedTensorsConfig(QuantizationConfig):
         quant_format: str,
         sparsity_scheme_map: dict[str, SparsityCompressionConfig],
         sparsity_ignore_list: list[str],
-        kv_cache_scheme: Optional[dict[str, Any]] = None,
-        config: Optional[dict[str, Any]] = None,
-        transform_config: Optional[dict[str, Any]] = None,
+        kv_cache_scheme: dict[str, Any] | None = None,
+        config: dict[str, Any] | None = None,
+        transform_config: dict[str, Any] | None = None,
     ):
         super().__init__()
         self.ignore = ignore
@@ -524,7 +524,7 @@ class CompressedTensorsConfig(QuantizationConfig):
         self,
         weight_quant: QuantizationArgs,
         input_quant: QuantizationArgs,
-        format: Optional[str] = None,
+        format: str | None = None,
     ) -> "CompressedTensorsScheme":
         # use the per-layer format if defined, otherwise, use global format
         format = format if format is not None else self.quant_format
@@ -631,7 +631,7 @@ class CompressedTensorsConfig(QuantizationConfig):
         raise NotImplementedError("No compressed-tensors compatible scheme was found.")
 
     def get_scheme(
-        self, layer: torch.nn.Module, layer_name: Optional[str] = None
+        self, layer: torch.nn.Module, layer_name: str | None = None
     ) -> Optional["CompressedTensorsScheme"]:
         """
         compressed-tensors supports non uniform in the following way:
@@ -674,7 +674,7 @@ class CompressedTensorsConfig(QuantizationConfig):
         sparsity_targets = self.sparsity_scheme_map.keys() - set(
             self.sparsity_ignore_list
         )
-        sparsity_scheme: Optional[SparsityCompressionConfig] = None
+        sparsity_scheme: SparsityCompressionConfig | None = None
         with suppress(ValueError):
             matched_target = find_matched_target(
                 layer_name=layer_name,
@@ -723,7 +723,7 @@ class CompressedTensorsConfig(QuantizationConfig):
         logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name)
         return scheme
 
-    def get_cache_scale(self, name: str) -> Optional[str]:
+    def get_cache_scale(self, name: str) -> str | None:
         """
         Check whether the param name matches the format for k/v cache scales
         in compressed-tensors. If this is the case, return its equivalent
@@ -751,9 +751,9 @@ class CompressedTensorsConfig(QuantizationConfig):
 
     @staticmethod
     def supports_cutlass_24(
-        weight_quant: Optional[QuantizationArgs],
-        input_quant: Optional[QuantizationArgs],
-        sparsity_scheme: Optional[SparsityCompressionConfig] = None,
+        weight_quant: QuantizationArgs | None,
+        input_quant: QuantizationArgs | None,
+        sparsity_scheme: SparsityCompressionConfig | None = None,
     ) -> bool:
         """
         Check if the layer is supported by the Cutlass 2:4 Kernel
@@ -853,7 +853,7 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ):
         """
         Use the output of create_weights and the CompressedTensorsScheme
@@ -878,7 +878,7 @@ class CompressedTensorsKVCacheMethod(BaseKVCacheMethod):
         super().__init__(quant_config)
 
     @staticmethod
-    def validate_kv_cache_scheme(kv_cache_scheme: Optional[dict[str, Any]]):
+    def validate_kv_cache_scheme(kv_cache_scheme: dict[str, Any] | None):
         """
         Validator for the kv cache scheme. Useful for controlling the
         kv cache quantization schemes, that are being supported in vLLM
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 41e7f1c7a4997..28383491207e7 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -2,8 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import enum
+from collections.abc import Callable
 from enum import Enum
-from typing import Callable, Optional, Union
 
 import torch
 from compressed_tensors import CompressionFormat
@@ -372,7 +372,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
             (layer.w2_input_global_scale), requires_grad=False
         )
 
-    def maybe_make_prepare_finalize(self) -> Optional[mk.FusedMoEPrepareAndFinalize]:
+    def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None:
         if self.use_marlin:
             return None
         elif not self.allow_flashinfer:
@@ -399,7 +399,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
-    ) -> Optional[FusedMoEQuantConfig]:
+    ) -> FusedMoEQuantConfig | None:
         if self.use_marlin:
             return None
 
@@ -420,21 +420,21 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `CompressedTensorsW4A4MoeMethod` yet."
@@ -913,7 +913,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                     layer.w2_weight_scale
                 )
 
-    def maybe_make_prepare_finalize(self) -> Optional[mk.FusedMoEPrepareAndFinalize]:
+    def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None:
         if self.use_marlin or self.rocm_aiter_moe_enabled:
             return None
         else:
@@ -997,7 +997,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
-    ) -> Optional[FusedMoEQuantConfig]:
+    ) -> FusedMoEQuantConfig | None:
         if self.use_marlin:
             return None
 
@@ -1022,21 +1022,21 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `CompressedTensorsW8A8Fp8MoEMethod` yet."
@@ -1280,7 +1280,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
-    ) -> Optional[FusedMoEQuantConfig]:
+    ) -> FusedMoEQuantConfig | None:
         return int8_w8a8_moe_quant_config(
             w1_scale=layer.w13_weight_scale,
             w2_scale=layer.w2_weight_scale,
@@ -1297,21 +1297,21 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert self.fused_experts is None
 
         if enable_eplb:
@@ -1604,7 +1604,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
-    ) -> Optional[FusedMoEQuantConfig]:
+    ) -> FusedMoEQuantConfig | None:
         return None
 
     def apply(
@@ -1615,21 +1615,21 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert self.fused_experts is None
 
         if enable_eplb:
@@ -1856,7 +1856,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
-    ) -> Optional[FusedMoEQuantConfig]:
+    ) -> FusedMoEQuantConfig | None:
         assert self.num_bits == 4 or self.num_bits == 8
         config_builder = (
             int4_w4a16_moe_quant_config
@@ -1880,21 +1880,21 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert self.fused_experts is None
 
         if enable_eplb:
@@ -2092,7 +2092,7 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
         def _pack_matrix(
             int4_as_int8_2d: torch.Tensor,
             scales_2d: torch.Tensor,
-            bias_1d: Optional[torch.Tensor],
+            bias_1d: torch.Tensor | None,
             in_features: int,
             out_features: int,
         ) -> torch.Tensor:
@@ -2192,7 +2192,7 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
-    ) -> Optional[FusedMoEQuantConfig]:
+    ) -> FusedMoEQuantConfig | None:
         # CPU dynamic 4-bit MoE path does not use modular kernels or
         # fused_experts; quant config is not needed.
         return None
@@ -2205,20 +2205,20 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor:
         assert not enable_eplb, "EPLB not supported for W4A8-int MoE yet."
         assert activation in ("silu", "swigluoai", "swiglu"), (
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
index 93a50a377ee56..571ce267f3fa6 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional
+from collections.abc import Callable
+from typing import Any
 
 import torch
 from compressed_tensors import CompressionFormat, ModelCompressor
@@ -42,9 +43,9 @@ class CompressedTensors24(CompressedTensorsScheme):
     def __init__(
         self,
         quantized: bool = False,
-        weight_quant: Optional[QuantizationArgs] = None,
-        input_quant: Optional[QuantizationArgs] = None,
-        model_compression_config: Optional[dict[str, Any]] = None,
+        weight_quant: QuantizationArgs | None = None,
+        input_quant: QuantizationArgs | None = None,
+        model_compression_config: dict[str, Any] | None = None,
     ):
         self.quantized = quantized
         self.weight_quant = weight_quant
@@ -247,7 +248,7 @@ class CompressedTensors24(CompressedTensorsScheme):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """
         Returns the output tensor for the layer with 2:4
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
index 688621cbf79af..a7f9076db7e95 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
-from typing import Optional
 
 import torch
 
@@ -33,7 +32,7 @@ class CompressedTensorsScheme(ABC):
 
     @abstractmethod
     def apply_weights(
-        self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]
+        self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None
     ):
         """
         Run the forward pass for the particular scheme. This is where
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
index af06418c959da..dd0f4b3d868d1 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 from torch.nn import Parameter
@@ -30,7 +30,7 @@ W4A16SPARSE24_SUPPORTED_BITS = list(W4A16SPARSE24_SUPPORTED_TYPES_MAP.keys())
 
 
 class CompressedTensorsW4A16Sparse24(CompressedTensorsScheme):
-    def __init__(self, strategy: str, num_bits: int, group_size: Optional[int] = None):
+    def __init__(self, strategy: str, num_bits: int, group_size: int | None = None):
         self.strategy = strategy
         self.group_size = group_size
         self.tile_size = 16
@@ -143,7 +143,7 @@ class CompressedTensorsW4A16Sparse24(CompressedTensorsScheme):
         layer.workspace = workspace
 
     def apply_weights(
-        self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]
+        self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None
     ) -> torch.Tensor:
         qweight = layer.weight_packed
         meta = layer.meta
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py
index a96f51538b38c..3afadc6eb7e5b 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 from torch.nn.parameter import Parameter
@@ -110,7 +110,7 @@ class CompressedTensorsW4A16Fp4(CompressedTensorsScheme):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return apply_fp4_marlin_linear(
             input=x,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
index 676f4de6ee7b1..192661c5b7ece 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 from torch.nn.parameter import Parameter
@@ -156,7 +156,7 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if envs.VLLM_USE_NVFP4_CT_EMULATIONS:
             out = run_nvfp4_emulations(
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py
index 59d99e1e1c907..a23961e897534 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 from compressed_tensors.quantization import ActivationOrdering
@@ -41,9 +41,9 @@ class CompressedTensorsW4A8Fp8(CompressedTensorsScheme):
         self,
         strategy: str,
         num_bits: int,
-        group_size: Optional[int] = None,
-        symmetric: Optional[bool] = True,
-        actorder: Optional[ActivationOrdering] = None,
+        group_size: int | None = None,
+        symmetric: bool | None = True,
+        actorder: ActivationOrdering | None = None,
     ):
         self.pack_factor = 32 // num_bits
         self.strategy = strategy
@@ -178,6 +178,6 @@ class CompressedTensorsW4A8Fp8(CompressedTensorsScheme):
         self.kernel.process_weights_after_loading(layer)
 
     def apply_weights(
-        self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]
+        self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None
     ) -> torch.Tensor:
         return self.kernel.apply_weights(layer, x, bias)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py
index 61a9f6b75cb13..aa0c52beda2b5 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_int.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 
@@ -36,7 +36,7 @@ class CompressedTensorsW4A8Int(CompressedTensorsScheme):
         self,
         strategy: str,
         num_bits: int,
-        group_size: Optional[int] = None,
+        group_size: int | None = None,
         is_static_input_scheme: bool = False,
         input_symmetric: bool = True,
     ):
@@ -148,6 +148,6 @@ class CompressedTensorsW4A8Int(CompressedTensorsScheme):
         self.kernel.process_weights_after_loading(layer)
 
     def apply_weights(
-        self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]
+        self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None
     ) -> torch.Tensor:
         return self.kernel.apply_weights(layer, x, bias)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
index 709d2538e6ad0..904a9f5d4907d 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 from compressed_tensors.quantization import QuantizationStrategy
@@ -125,7 +125,7 @@ class CompressedTensorsW8A16Fp8(CompressedTensorsScheme):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return apply_fp8_marlin_linear(
             input=x,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
index 902c9c7bde97b..ee431c9148b86 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
@@ -179,7 +179,7 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if self.weight_block_size is not None:
             return self.w8a8_block_fp8_linear.apply(
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
index 70316a7553ca3..6fd0a6a1c822c 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 from compressed_tensors.quantization import QuantizationStrategy
@@ -120,6 +120,6 @@ class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
         self.kernel.process_weights_after_loading(layer)
 
     def apply_weights(
-        self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]
+        self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None
     ) -> torch.Tensor:
         return self.kernel.apply_weights(layer, x, bias)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
index 188fc15fd9485..2267395fe67d3 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 from compressed_tensors.quantization import ActivationOrdering
@@ -42,9 +42,9 @@ class CompressedTensorsWNA16(CompressedTensorsScheme):
         self,
         strategy: str,
         num_bits: int,
-        group_size: Optional[int] = None,
-        symmetric: Optional[bool] = True,
-        actorder: Optional[ActivationOrdering] = None,
+        group_size: int | None = None,
+        symmetric: bool | None = True,
+        actorder: ActivationOrdering | None = None,
     ):
         self.pack_factor = 32 // num_bits
         self.strategy = strategy
@@ -214,6 +214,6 @@ class CompressedTensorsWNA16(CompressedTensorsScheme):
         self.kernel.process_weights_after_loading(layer)
 
     def apply_weights(
-        self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]
+        self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None
     ) -> torch.Tensor:
         return self.kernel.apply_weights(layer, x, bias)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py
index edd2706b470fd..696356ef1e33b 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py
@@ -1,8 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Generator
+from collections.abc import Callable, Generator
 from itertools import accumulate
-from typing import Callable, Optional
 
 import torch
 from compressed_tensors.transform import (
@@ -38,7 +37,7 @@ class CompressedTensorsLinearTransformMethod(LinearMethodBase):
     def from_schemes(
         cls,
         quant_method: LinearMethodBase,
-        quant_scheme: Optional[CompressedTensorsScheme],
+        quant_scheme: CompressedTensorsScheme | None,
         input_tfms: dict[int, TransformTuple],
         output_tfms: dict[int, TransformTuple],
     ) -> "CompressedTensorsLinearTransformMethod":
@@ -66,8 +65,8 @@ class CompressedTensorsLinearTransformMethod(LinearMethodBase):
         self.input_tfms = input_tfms
         self.output_tfms = output_tfms
 
-        self.input_transform: Optional[HadamardTransform] = None
-        self.output_transform: Optional[HadamardTransform] = None
+        self.input_transform: HadamardTransform | None = None
+        self.output_transform: HadamardTransform | None = None
 
     def create_weights(
         self,
@@ -151,7 +150,7 @@ class CompressedTensorsLinearTransformMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if self.input_transform is not None:
             x = self.input_transform(x)
@@ -194,7 +193,7 @@ class CompressedTensorsLinearTransformMethod(LinearMethodBase):
 def get_linear_transform_schemes(
     layer: torch.nn.Module,
     layer_name: str,
-    transform_config: Optional[TransformConfig],
+    transform_config: TransformConfig | None,
     packed_modules_mapping: dict[str, list[str]],
 ) -> tuple[
     dict[int, TransformTuple], dict[int, TransformTuple]
@@ -226,7 +225,7 @@ def get_linear_transform_schemes(
 
 
 def get_schemes_args(
-    transform_config: Optional[TransformConfig],
+    transform_config: TransformConfig | None,
 ) -> Generator[tuple[str, TransformScheme, TransformArgs]]:
     if transform_config is None:
         return
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py
index ecd798257fce2..f5589c8c07fa6 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py
@@ -1,8 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
-from collections.abc import Hashable
-from typing import Callable
+from collections.abc import Callable, Hashable
 
 import torch
 from compressed_tensors.transform import (
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py
index b800c5f5d436a..f0bb47a728ad5 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 
@@ -17,7 +16,7 @@ __all__ = ["is_qutlass_fp4_scheme", "QutlassNvFP4LinearMethod"]
 
 
 def is_qutlass_fp4_scheme(
-    quant_scheme: Optional[CompressedTensorsScheme],
+    quant_scheme: CompressedTensorsScheme | None,
     input_tfms: dict[int, TransformTuple],
 ) -> bool:
     return (
@@ -60,6 +59,6 @@ class QutlassNvFP4LinearMethod(CompressedTensorsLinearTransformMethod):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         raise NotImplementedError()
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
index ed326197295dd..25c7d335da200 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -145,7 +144,7 @@ def triton_scaled_mm(
     scale_a: torch.Tensor,
     scale_b: torch.Tensor,
     out_dtype: type[torch.dtype],
-    bias: Optional[torch.Tensor] = None,
+    bias: torch.Tensor | None = None,
     block_size_m: int = 32,
     block_size_n: int = 32,
     block_size_k: int = 32,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index d8beaafff2ef1..f88092169110b 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -3,7 +3,6 @@
 
 from collections.abc import Iterable, Mapping
 from types import MappingProxyType
-from typing import Optional
 
 import regex as re
 from compressed_tensors import CompressionFormat
@@ -21,7 +20,7 @@ def is_activation_quantization_format(format: str) -> bool:
 
 
 def should_ignore_layer(
-    layer_name: Optional[str],
+    layer_name: str | None,
     ignore: Iterable[str] = tuple(),
     fused_mapping: Mapping[str, list[str]] = MappingProxyType({}),
 ) -> bool:
@@ -84,7 +83,7 @@ def check_equal_or_regex_match(layer_name: str, targets: Iterable[str]) -> bool:
 
 
 def find_matched_target(
-    layer_name: Optional[str],
+    layer_name: str | None,
     module: Module,
     targets: Iterable[str],
     fused_mapping: Mapping[str, list[str]] = MappingProxyType({}),
@@ -134,7 +133,7 @@ def find_matched_target(
 
 def _find_first_match(
     value: str, targets: Iterable[str], check_contains: bool = False
-) -> Optional[str]:
+) -> str | None:
     """
     Returns first element of target that matches value either
     exactly or as a regex after 're:'. If check_contains is set to True,
@@ -176,7 +175,7 @@ def _match_fused_layer(
     layer_name: str,
     target_layers: Iterable[str],
     fused_mapping: Mapping[str, list[str]],
-) -> Optional[str]:
+) -> str | None:
     """
     Match a fused layer name to its corresponding individual layer in
     target_layers. Returns first value in fused_mapping which matches targets
@@ -205,7 +204,7 @@ def _match_fused_layer(
     ]
 
     # for each unfused component, find a match in targets
-    unfused_matches: list[Optional[str]] = []
+    unfused_matches: list[str | None] = []
     for unfused in unfused_paths:
         for target in target_layers:
             if _is_equal_or_regex_match(unfused, target):
diff --git a/vllm/model_executor/layers/quantization/deepspeedfp.py b/vllm/model_executor/layers/quantization/deepspeedfp.py
index 82a2103a19f33..4f742d8345739 100644
--- a/vllm/model_executor/layers/quantization/deepspeedfp.py
+++ b/vllm/model_executor/layers/quantization/deepspeedfp.py
@@ -140,7 +140,7 @@ class DeepSpeedFPLinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         weight = layer.weight
         y = weight.ds_dequantize()
diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py
index 909b04c79f238..754608af97c6b 100644
--- a/vllm/model_executor/layers/quantization/experts_int8.py
+++ b/vllm/model_executor/layers/quantization/experts_int8.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable
+from typing import Any, Optional
 
 import torch
 
@@ -129,7 +130,7 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
-    ) -> Optional[FusedMoEQuantConfig]:
+    ) -> FusedMoEQuantConfig | None:
         return int8_w8a16_moe_quant_config(
             w1_scale=layer.w13_scale, w2_scale=layer.w2_scale, w1_zp=None, w2_zp=None
         )
@@ -142,21 +143,21 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert self.fused_experts is None
 
         if enable_eplb:
diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py
index 5d390cbd7b1ef..6ba18e59e4d54 100644
--- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py
+++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py
@@ -171,7 +171,7 @@ class FBGEMMFp8LinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if self.quant_config.use_marlin:
             return apply_fp8_marlin_linear(
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 73e0044803984..9a03105fafbf6 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from collections.abc import Callable
 from enum import Enum
-from typing import TYPE_CHECKING, Any, Callable, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional
 
 import torch
 from torch.nn import Module
@@ -173,8 +174,8 @@ class Fp8Config(QuantizationConfig):
         self,
         is_checkpoint_fp8_serialized: bool = False,
         activation_scheme: str = "dynamic",
-        ignored_layers: Optional[list[str]] = None,
-        weight_block_size: Optional[list[int]] = None,
+        ignored_layers: list[str] | None = None,
+        weight_block_size: list[int] | None = None,
     ) -> None:
         super().__init__()
 
@@ -298,7 +299,7 @@ class Fp8Config(QuantizationConfig):
             return Fp8KVCacheMethod(self)
         return None
 
-    def get_cache_scale(self, name: str) -> Optional[str]:
+    def get_cache_scale(self, name: str) -> str | None:
         """
         Check whether the param name matches the format for k/v cache scales
         in compressed-tensors. If this is the case, return its equivalent
@@ -530,7 +531,7 @@ class Fp8LinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if self.use_marlin:
             return apply_fp8_marlin_linear(
@@ -584,12 +585,12 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         self.weight_block_size = self.quant_config.weight_block_size
         self.block_quant: bool = self.weight_block_size is not None
 
-        self.fused_experts: Optional[mk.FusedMoEModularKernel] = None  # type: ignore
+        self.fused_experts: mk.FusedMoEModularKernel | None = None  # type: ignore
 
         self.fp8_backend = get_fp8_moe_backend(self.block_quant)
 
         self.use_marlin = self.fp8_backend == Fp8MoeBackend.MARLIN
-        self.flashinfer_moe_backend: Optional[FlashinferMoeBackend] = None
+        self.flashinfer_moe_backend: FlashinferMoeBackend | None = None
         if self.fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM:
             self.flashinfer_moe_backend = FlashinferMoeBackend.TENSORRT_LLM
         elif self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS:
@@ -970,7 +971,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                     layer.w2_weight_scale_inv
                 )
 
-    def maybe_make_prepare_finalize(self) -> Optional[mk.FusedMoEPrepareAndFinalize]:
+    def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None:
         if (
             self.rocm_aiter_moe_enabled
             or self.use_marlin
@@ -1043,7 +1044,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
-    ) -> Optional[FusedMoEQuantConfig]:
+    ) -> FusedMoEQuantConfig | None:
         if self.use_marlin:
             return None
 
@@ -1069,21 +1070,21 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
             assert expert_load_view is not None
             assert logical_to_physical_map is not None
diff --git a/vllm/model_executor/layers/quantization/fp_quant.py b/vllm/model_executor/layers/quantization/fp_quant.py
index 929e603149905..f00ea17ab6773 100644
--- a/vllm/model_executor/layers/quantization/fp_quant.py
+++ b/vllm/model_executor/layers/quantization/fp_quant.py
@@ -3,7 +3,7 @@
 
 # Supports FP-Quant compression, see https://arxiv.org/abs/2509.23202
 
-from typing import Any, Optional
+from typing import Any
 
 import torch
 from torch.nn.parameter import Parameter
@@ -36,7 +36,7 @@ class FPQuantConfig(QuantizationConfig):
         forward_dtype: str = "mxfp4",
         forward_method: str = "abs_max",
         pseudoquantization: bool = False,
-        modules_to_not_convert: Optional[list[str]] = None,
+        modules_to_not_convert: list[str] | None = None,
     ) -> None:
         super().__init__()
         self.hadamard_group_size = hadamard_group_size
@@ -90,7 +90,7 @@ class FPQuantConfig(QuantizationConfig):
 
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional[LinearMethodBase]:
+    ) -> LinearMethodBase | None:
         if self.modules_to_not_convert is not None and any(
             prefix.endswith(module) for module in self.modules_to_not_convert
         ):
@@ -233,7 +233,7 @@ class FPQuantLinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return quantized_forward(
             x,
@@ -381,7 +381,7 @@ def quantized_forward(
     weight_scales: torch.Tensor,
     weight_global_scale: torch.Tensor,
     act_global_scale: torch.Tensor,
-    bias: Optional[torch.Tensor],
+    bias: torch.Tensor | None,
     forward_hadamard_matrix: torch.Tensor,
     forward_method: str,
     forward_dtype: str,
diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py
index 8296bc2ea3b48..84cd07a0c1743 100644
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable
+from typing import Any, Optional
 
 import gguf
 import torch
@@ -35,7 +36,7 @@ logger = init_logger(__name__)
 class GGUFConfig(QuantizationConfig):
     """Config class for GGUF."""
 
-    def __init__(self, unquantized_modules: Optional[list[str]] = None) -> None:
+    def __init__(self, unquantized_modules: list[str] | None = None) -> None:
         super().__init__()
         self.unquantized_modules = unquantized_modules or []
 
@@ -307,7 +308,7 @@ def _apply_gguf_embedding(
     qweight: torch.Tensor,
     qweight_type: int,
     hidden_size: int,
-    dtype: Optional[torch.dtype] = None,
+    dtype: torch.dtype | None = None,
 ) -> torch.Tensor:
     if qweight_type in UNQUANTIZED_TYPES:
         return torch.embedding(qweight, x)
@@ -330,7 +331,7 @@ def _apply_gguf_embedding_fake(
     qweight: torch.Tensor,
     qweight_type: int,
     hidden_size: int,
-    dtype: Optional[torch.dtype] = None,
+    dtype: torch.dtype | None = None,
 ) -> torch.Tensor:
     return torch.empty(x.shape[0], hidden_size, dtype=dtype, device=x.device)
 
@@ -452,7 +453,7 @@ class GGUFLinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         shard_id = layer.qweight.shard_id
 
@@ -558,7 +559,7 @@ class GGUFMoEMethod(FusedMoEMethodBase):
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
-    ) -> Optional[FusedMoEQuantConfig]:
+    ) -> FusedMoEQuantConfig | None:
         return None
 
     def apply(
@@ -569,21 +570,21 @@ class GGUFMoEMethod(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert self.fused_experts is None
 
         if enable_eplb:
diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py
index 8f36fc70c4447..f65c6156d040a 100644
--- a/vllm/model_executor/layers/quantization/gptq.py
+++ b/vllm/model_executor/layers/quantization/gptq.py
@@ -4,7 +4,7 @@
 import enum
 from enum import Enum
 from fractions import Fraction
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Union
 
 import torch
 from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
@@ -48,9 +48,9 @@ class GPTQConfig(QuantizationConfig):
         group_size: int,
         desc_act: bool,
         lm_head_quantized: bool,
-        dynamic: dict[str, dict[str, Union[int, bool]]],
+        dynamic: dict[str, dict[str, int | bool]],
         autoround_version: str = "",
-        modules_in_block_to_quantize: Optional[list[str]] = None,
+        modules_in_block_to_quantize: list[str] | None = None,
     ) -> None:
         # GPTQModel use `dynamic` config property to allow per module
         # quantization config so each module can be individually optimized.
@@ -148,7 +148,7 @@ class GPTQConfig(QuantizationConfig):
 
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional[Union["GPTQLinearMethod", "QuantizeMethodBase"]]:
+    ) -> Union["GPTQLinearMethod", "QuantizeMethodBase"] | None:
         if isinstance(layer, FusedMoE):
             # GPTQ MoE support: fall back to MoeWNA16 for broad compatibility
             from .moe_wna16 import MoeWNA16Config
@@ -170,7 +170,7 @@ class GPTQConfig(QuantizationConfig):
                 self.modules_in_block_to_quantize
             )
 
-    def maybe_update_config(self, model_name: str, revision: Optional[str] = None):
+    def maybe_update_config(self, model_name: str, revision: str | None = None):
         if self.modules_in_block_to_quantize:
             if is_list_of(self.modules_in_block_to_quantize, list):
                 # original modules_in_block_to_quantize: list[list[str]]
@@ -345,7 +345,7 @@ class GPTQLinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         out_shape = x.shape[:-1] + (layer.qweight.shape[-1],)
         reshaped_x = x.reshape(-1, x.shape[-1])
diff --git a/vllm/model_executor/layers/quantization/gptq_bitblas.py b/vllm/model_executor/layers/quantization/gptq_bitblas.py
index 85cf4ed4ac58c..92f10bfd5c02d 100644
--- a/vllm/model_executor/layers/quantization/gptq_bitblas.py
+++ b/vllm/model_executor/layers/quantization/gptq_bitblas.py
@@ -71,7 +71,7 @@ class GPTQBitBLASConfig(QuantizationConfig):
         group_size: int,
         desc_act: bool,
         is_sym: bool,
-        quant_method: Optional[str],
+        quant_method: str | None,
         lm_head_quantized: bool,
     ) -> None:
         try:
@@ -180,7 +180,7 @@ class GPTQBitBLASConfig(QuantizationConfig):
     @classmethod
     def override_quantization_method(
         cls, hf_quant_cfg, user_quant
-    ) -> Optional[QuantizationMethods]:
+    ) -> QuantizationMethods | None:
         can_convert = cls.is_gptq_bitblas_compatible(hf_quant_cfg)
 
         is_valid_user_quant = (
@@ -474,7 +474,7 @@ class GPTQBitBLASLinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         out = self.kernel.apply_gptq_bitblas_linear(layer, x)
         if bias is not None:
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 8fa70a240f9ff..dd86c990259f1 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from collections.abc import Callable
 from copy import deepcopy
-from typing import Any, Callable, Optional, Union
+from typing import Any, Optional
 
 import torch
 from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
@@ -103,9 +104,9 @@ class GPTQMarlinConfig(QuantizationConfig):
         desc_act: bool,
         is_sym: bool,
         lm_head_quantized: bool,
-        dynamic: dict[str, dict[str, Union[int, bool]]],
+        dynamic: dict[str, dict[str, int | bool]],
         full_config: dict[str, Any],
-        modules_in_block_to_quantize: Optional[list[str]] = None,
+        modules_in_block_to_quantize: list[str] | None = None,
     ) -> None:
         super().__init__()
         if desc_act and group_size == -1:
@@ -211,7 +212,7 @@ class GPTQMarlinConfig(QuantizationConfig):
     @classmethod
     def override_quantization_method(
         cls, hf_quant_cfg, user_quant
-    ) -> Optional[QuantizationMethods]:
+    ) -> QuantizationMethods | None:
         can_convert = cls.is_gptq_marlin_compatible(hf_quant_cfg)
 
         is_valid_user_quant = (
@@ -283,7 +284,7 @@ class GPTQMarlinConfig(QuantizationConfig):
                 self.modules_in_block_to_quantize
             )
 
-    def maybe_update_config(self, model_name: str, revision: Optional[str] = None):
+    def maybe_update_config(self, model_name: str, revision: str | None = None):
         if self.modules_in_block_to_quantize:
             if is_list_of(self.modules_in_block_to_quantize, list):
                 # original modules_in_block_to_quantize: list[list[str]]
@@ -459,7 +460,7 @@ class GPTQMarlinLinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return self.kernel.apply_weights(layer, x, bias)
 
@@ -714,7 +715,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
-    ) -> Optional[FusedMoEQuantConfig]:
+    ) -> FusedMoEQuantConfig | None:
         return None
 
     def apply(
@@ -725,21 +726,21 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert self.fused_experts is None
 
         if enable_eplb:
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin_24.py b/vllm/model_executor/layers/quantization/gptq_marlin_24.py
index 8f0df55b0a5cf..2fb614b4746ea 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin_24.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin_24.py
@@ -114,7 +114,7 @@ class GPTQMarlin24Config(QuantizationConfig):
     @classmethod
     def override_quantization_method(
         cls, hf_quant_cfg, user_quant
-    ) -> Optional[QuantizationMethods]:
+    ) -> QuantizationMethods | None:
         is_marlin_24_format = hf_quant_cfg.get("checkpoint_format") == "marlin_24"
 
         is_valid_user_quant = (
@@ -287,7 +287,7 @@ class GPTQMarlin24LinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         qweight = layer.B_24
         meta = layer.B_meta
diff --git a/vllm/model_executor/layers/quantization/hqq_marlin.py b/vllm/model_executor/layers/quantization/hqq_marlin.py
index e61caf6b459b0..5fb67c35378be 100644
--- a/vllm/model_executor/layers/quantization/hqq_marlin.py
+++ b/vllm/model_executor/layers/quantization/hqq_marlin.py
@@ -45,7 +45,7 @@ class HQQMarlinConfig(QuantizationConfig):
         self,
         weight_bits: int,
         group_size: int,
-        skip_modules: Optional[list[str]] = None,
+        skip_modules: list[str] | None = None,
     ) -> None:
         super().__init__()
         assert group_size == 64, "The only supported HQQ group size is currently 64."
@@ -327,7 +327,7 @@ class HQQMarlinMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         workspace = MarlinWorkspace(
             self.output_size_per_partition,
diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py
index 8786638869a4e..7ded8eea79060 100644
--- a/vllm/model_executor/layers/quantization/input_quant_fp8.py
+++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 import torch.nn.functional as F
@@ -30,9 +29,9 @@ class QuantFP8(CustomOp):
         self,
         static: bool,
         group_shape: GroupShape,
-        num_token_padding: Optional[int] = None,
+        num_token_padding: int | None = None,
         column_major_scales: bool = False,
-        use_ue8m0: Optional[bool] = None,  # for Torch compile
+        use_ue8m0: bool | None = None,  # for Torch compile
     ):
         """
         :param static: static or dynamic quantization
@@ -64,8 +63,8 @@ class QuantFP8(CustomOp):
     def forward_cuda(
         self,
         x: torch.Tensor,
-        scale: Optional[torch.Tensor] = None,
-        scale_ub: Optional[torch.Tensor] = None,
+        scale: torch.Tensor | None = None,
+        scale_ub: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if self.is_group_quant:
             assert scale is None, "Group quantization is always dynamic"
@@ -96,8 +95,8 @@ class QuantFP8(CustomOp):
     def forward_native(
         self,
         x: torch.Tensor,
-        scale: Optional[torch.Tensor] = None,
-        scale_ub: Optional[torch.Tensor] = None,
+        scale: torch.Tensor | None = None,
+        scale_ub: torch.Tensor | None = None,
     ):
         if self.is_group_quant:
             assert scale is None, "Group quantization is always dynamic"
diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py
index 4aa0e464e0f53..8616e8f4516aa 100644
--- a/vllm/model_executor/layers/quantization/ipex_quant.py
+++ b/vllm/model_executor/layers/quantization/ipex_quant.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional
+from collections.abc import Callable
+from typing import Any, Optional
 
 import torch
 from packaging import version
@@ -50,9 +51,9 @@ class IPEXConfig(QuantizationConfig):
         method: str,
         weight_bits: int,
         group_size: int,
-        modules_to_not_convert: Optional[list[str]] = None,
-        desc_act: Optional[bool] = None,
-        lm_head_quantized: Optional[bool] = None,
+        modules_to_not_convert: list[str] | None = None,
+        desc_act: bool | None = None,
+        lm_head_quantized: bool | None = None,
     ) -> None:
         super().__init__()
         self.method = method
@@ -122,7 +123,7 @@ class IPEXConfig(QuantizationConfig):
     @classmethod
     def override_quantization_method(
         cls, hf_quant_cfg, user_quant
-    ) -> Optional[QuantizationMethods]:
+    ) -> QuantizationMethods | None:
         if not current_platform.is_cpu() and not current_platform.is_xpu():
             return None
 
@@ -206,7 +207,7 @@ class IPEXGPTQLinearMethod(GPTQLinearMethod):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         reshaped_x = x.reshape(-1, x.shape[-1])
         out = layer.ipex_qlinear(reshaped_x)
@@ -275,7 +276,7 @@ class IPEXAWQLinearMethod(AWQLinearMethod):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         reshaped_x = x.reshape(-1, x.shape[-1])
         out = layer.ipex_qlinear(reshaped_x)
@@ -299,7 +300,7 @@ class XPUFp8LinearMethod(Fp8LinearMethod):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         weight = layer.weight.data
         weight_scale = layer.weight_scale.data
@@ -410,7 +411,7 @@ class XPUFp8MoEMethod(FusedMoEMethodBase):
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
-    ) -> Optional[FusedMoEQuantConfig]:
+    ) -> FusedMoEQuantConfig | None:
         return None
 
     def apply(
@@ -421,20 +422,20 @@ class XPUFp8MoEMethod(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return layer.ipex_fusion(
             x,
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
index 055a3ebbced61..7aeb1f86c2794 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py
@@ -2,8 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
+from collections.abc import Callable
 from dataclasses import dataclass
-from typing import Callable, Optional
 
 import torch
 
@@ -20,7 +20,7 @@ class MPLinearLayerConfig:
     group_size: int
     zero_points: bool
     has_g_idx: bool
-    out_type: Optional[torch.dtype] = None
+    out_type: torch.dtype | None = None
 
 
 class MPLinearKernel(ABC):
@@ -31,7 +31,7 @@ class MPLinearKernel(ABC):
 
     @classmethod
     @abstractmethod
-    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
         raise NotImplementedError
 
     def __init__(
@@ -39,8 +39,8 @@ class MPLinearKernel(ABC):
         c: MPLinearLayerConfig,
         w_q_param_name: str,
         w_s_param_name: str,
-        w_zp_param_name: Optional[str] = None,
-        w_gidx_param_name: Optional[str] = None,
+        w_zp_param_name: str | None = None,
+        w_gidx_param_name: str | None = None,
     ) -> None:
         assert self.can_implement(c)
         self.config = c
@@ -62,12 +62,12 @@ class MPLinearKernel(ABC):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         raise NotImplementedError
 
     def _transform_param(
-        self, layer: torch.nn.Module, name: Optional[str], fn: Callable
+        self, layer: torch.nn.Module, name: str | None, fn: Callable
     ) -> None:
         if name is not None and getattr(layer, name, None) is not None:
             old_param = getattr(layer, name)
@@ -83,8 +83,8 @@ class MPLinearKernel(ABC):
     ) -> tuple[
         torch.Tensor,  # w_q
         torch.Tensor,  # w_s
-        Optional[torch.Tensor],  # w_zp,
-        Optional[torch.Tensor],  # w_gidx
+        torch.Tensor | None,  # w_zp,
+        torch.Tensor | None,  # w_gidx
     ]:
         return (
             getattr(layer, self.w_q_name),
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
index 1759d142e6cc1..0cf3f12af5522 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
-
 import vllm.envs as envs
 from vllm.model_executor.layers.quantization.kernels.mixed_precision.allspark import (  # noqa: E501
     AllSparkLinearKernel,
@@ -48,7 +46,7 @@ _POSSIBLE_KERNELS: list[type[MPLinearKernel]] = [
 
 
 def choose_mp_linear_kernel(
-    config: MPLinearLayerConfig, compute_capability: Optional[int] = None
+    config: MPLinearLayerConfig, compute_capability: int | None = None
 ) -> type[MPLinearKernel]:
     """
     Choose an MPLinearKernel that can implement the given config for the given
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py
index c353372b05ec1..3baef454251a0 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -22,7 +21,7 @@ class AllSparkLinearKernel(MPLinearKernel):
         return 80
 
     @classmethod
-    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
         if c.has_g_idx:
             return False, "Act reordering currently not supported by AllSpark"
 
@@ -87,7 +86,7 @@ class AllSparkLinearKernel(MPLinearKernel):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         c = self.config
         gemm_args = self.gemm_args
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py
index d1ff582c4e216..59c6a4f961547 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 from packaging import version
@@ -44,9 +43,9 @@ class BitBLASLinearKernel(MPLinearKernel):
         c: MPLinearLayerConfig,
         w_q_param_name: str,
         w_s_param_name: str,
-        w_zp_param_name: Optional[str] = None,
-        w_gidx_param_name: Optional[str] = None,
-        bitblas_quant_config: Optional[QuantizationConfig] = None,
+        w_zp_param_name: str | None = None,
+        w_gidx_param_name: str | None = None,
+        bitblas_quant_config: QuantizationConfig | None = None,
     ):
         self.quant_config = bitblas_quant_config
         super().__init__(
@@ -57,7 +56,7 @@ class BitBLASLinearKernel(MPLinearKernel):
         self,
         b_q_weight: torch.Tensor,
         scales: torch.Tensor,
-        qzeros: Optional[torch.Tensor] = None,
+        qzeros: torch.Tensor | None = None,
     ):
         from bitblas.quantization.utils import general_compress
 
@@ -82,7 +81,7 @@ class BitBLASLinearKernel(MPLinearKernel):
         # qzeros should be de-quantized to int zeros.
         weight_bits = quant_config.weight_bits  # type: ignore[union-attr]
         intzeros = unpack_gptq_qzeros(qzeros, weight_bits).T.contiguous()
-        zeros: Optional[torch.Tensor] = None
+        zeros: torch.Tensor | None = None
         zeros_mode = self.bitblas_matmul.config.zeros_mode  # type: ignore[attr-defined]
         if zeros_mode == "original":
             zeros = intzeros.to(torch.float16).contiguous()
@@ -113,7 +112,7 @@ class BitBLASLinearKernel(MPLinearKernel):
         return 70
 
     @classmethod
-    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
         is_bitblas_installed = True
 
         try:
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py
index 281fca7888ab3..53b2e15df76db 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from importlib.util import find_spec
-from typing import Final, Optional
+from typing import Final
 
 import torch
 
@@ -26,7 +26,7 @@ class ConchLinearKernel(MPLinearKernel):
         return 80
 
     @classmethod
-    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
         if c.weight_type not in _CONCH_SUPPORTED_WEIGHT_TYPES:
             error_msg = (
                 f"Weight type ({c.weight_type}) not supported by "
@@ -76,7 +76,7 @@ class ConchLinearKernel(MPLinearKernel):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         from conch.ops.quantization.gemm import mixed_precision_gemm
 
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py
index f5df7a244b426..8ef6457c952f1 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -26,7 +25,7 @@ class CutlassW4A8LinearKernel(MPLinearKernel):
         return 90
 
     @classmethod
-    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
         if not current_platform.is_cuda():
             return False, "CUTLASS only supported on CUDA"
 
@@ -95,7 +94,7 @@ class CutlassW4A8LinearKernel(MPLinearKernel):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         c = self.config
         w_q, w_s, _, _ = self._get_weight_params(layer)
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py
index 7631236e6f642..d09bd86a7274a 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -20,7 +19,7 @@ class Dynamic4bitLinearKernel(MPLinearKernel):
         return 1
 
     @classmethod
-    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
         if not current_platform.is_cpu():
             return False, "Only CPU is supported"
         if c.weight_type not in cls.SUPPORTED_QUANT_TYPES:
@@ -95,7 +94,7 @@ class Dynamic4bitLinearKernel(MPLinearKernel):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         c = self.config
         x_2d = x.reshape(-1, x.shape[-1])
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
index a57d3f65267ec..27d8344f6b488 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -25,7 +24,7 @@ class ExllamaLinearKernel(MPLinearKernel):
         return 60
 
     @classmethod
-    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
         if c.has_g_idx and c.partition_weight_shape[0] != c.full_weight_shape[0]:
             return (
                 False,
@@ -137,7 +136,7 @@ class ExllamaLinearKernel(MPLinearKernel):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         c = self.config
 
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py
index df2f8fedce7e7..7953ed5e8ee47 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from functools import partial
-from typing import Optional
 
 import torch
 
@@ -28,7 +27,7 @@ class MacheteLinearKernel(MPLinearKernel):
         return 90
 
     @classmethod
-    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
         # Machete uses CUTLASS, so it can only be compatible with Nvidia
         if not current_platform.is_cuda():
             return False, "Machete only supported on CUDA"
@@ -129,7 +128,7 @@ class MacheteLinearKernel(MPLinearKernel):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         c = self.config
         w_q, w_s, w_zp, _ = self._get_weight_params(layer)
diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
index 0be448e4e3d8a..ac21286eeffac 100644
--- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
+++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -32,7 +31,7 @@ class MarlinLinearKernel(MPLinearKernel):
         return 80
 
     @classmethod
-    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
         # Marlin uses inline PTX, so it can only be compatible with Nvidia
         if not current_platform.is_cuda():
             return False, "Marlin only supported on CUDA"
@@ -144,7 +143,7 @@ class MarlinLinearKernel(MPLinearKernel):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         c = self.config
         w_q, w_s, w_zp, w_gidx = self._get_weight_params(layer)
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
index d9b999e3d5ddc..2a885ec899458 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py
@@ -3,7 +3,6 @@
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 
@@ -23,7 +22,7 @@ class ScaledMMLinearKernel(ABC):
 
     @classmethod
     @abstractmethod
-    def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
         raise NotImplementedError
 
     def __init__(
@@ -52,7 +51,7 @@ class ScaledMMLinearKernel(ABC):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         raise NotImplementedError
 
@@ -61,9 +60,9 @@ class ScaledMMLinearKernel(ABC):
     ) -> tuple[
         torch.Tensor,  # weight
         torch.Tensor,  # weight_scale
-        Optional[torch.Tensor],  # input_scale,
-        Optional[torch.Tensor],  # input_zp
-        Optional[torch.Tensor],  # azp_adj
+        torch.Tensor | None,  # input_scale,
+        torch.Tensor | None,  # input_zp
+        torch.Tensor | None,  # azp_adj
     ]:
         return (
             getattr(layer, self.w_q_name),
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
index ee5416bae01c6..dd59e5d935dcb 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
-from typing import Optional
 
 from vllm.model_executor.layers.quantization.kernels.scaled_mm.aiter import (
     AiterScaledMMLinearKernel,
@@ -35,7 +34,7 @@ _POSSIBLE_KERNELS: dict[PlatformEnum, list[type[ScaledMMLinearKernel]]] = {
 
 
 def choose_scaled_mm_linear_kernel(
-    config: ScaledMMLinearLayerConfig, compute_capability: Optional[int] = None
+    config: ScaledMMLinearLayerConfig, compute_capability: int | None = None
 ) -> type[ScaledMMLinearKernel]:
     """
     Choose an ScaledMMLinearKernel that can implement the given config for the
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
index e97beefdd9c2c..5e133aac10fa0 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -19,7 +18,7 @@ def rocm_aiter_gemm_w8a8_impl(
     B: torch.Tensor,
     As: torch.Tensor,
     Bs: torch.Tensor,
-    bias: Optional[torch.Tensor] = None,
+    bias: torch.Tensor | None = None,
     output_dtype: torch.dtype = torch.float16,
 ) -> torch.Tensor:
     from aiter import gemm_a8w8_CK
@@ -36,7 +35,7 @@ def rocm_aiter_gemm_w8a8_fake(
     B: torch.Tensor,
     As: torch.Tensor,
     Bs: torch.Tensor,
-    bias: Optional[torch.Tensor] = None,
+    bias: torch.Tensor | None = None,
     output_dtype: torch.dtype = torch.float16,
 ) -> torch.Tensor:
     m = A.shape[0]
@@ -59,7 +58,7 @@ class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel):
         return 90
 
     @classmethod
-    def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
         if not current_platform.is_rocm():
             return (
                 False,
@@ -99,7 +98,7 @@ class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """
         `AiterScaledMMLinearKernel` implements a fused version of
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
index cb00b0c8af210..feb1e0bee1aaf 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -24,7 +23,7 @@ class CPUScaledMMLinearKernel(ScaledMMLinearKernel):
         return 75
 
     @classmethod
-    def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
         if not current_platform.is_cpu():
             return False, "CPUScaledMM requires running on CPU."
 
@@ -173,7 +172,7 @@ class CPUScaledMMLinearKernel(ScaledMMLinearKernel):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return self.linear_method(
             layer,
@@ -185,7 +184,7 @@ class CPUScaledMMLinearKernel(ScaledMMLinearKernel):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer)
 
@@ -207,7 +206,7 @@ class CPUScaledMMLinearKernel(ScaledMMLinearKernel):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         w_q, w_s, _, _, _ = self._get_weight_params(layer)
         return torch.ops._C.int8_scaled_mm_with_quant(
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
index 13dbd55c32df9..e8769916b4cef 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -21,7 +20,7 @@ class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
         return 75
 
     @classmethod
-    def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
         if not current_platform.is_cuda():
             return False, "CutlassScaledMM requires running on CUDA."
 
@@ -110,7 +109,7 @@ class CutlassScaledMMLinearKernel(ScaledMMLinearKernel):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer)
 
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
index 7e21afca5750c..3f4ec7f2a738b 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -17,7 +16,7 @@ class TritonScaledMMLinearKernel(CutlassScaledMMLinearKernel):
         return 75
 
     @classmethod
-    def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
         if current_platform.is_cpu():
             return (
                 False,
@@ -38,6 +37,6 @@ class TritonScaledMMLinearKernel(CutlassScaledMMLinearKernel):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return super().apply_weights(layer, x, bias)
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
index 63eee1e288618..ddac9f13cf4f3 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import warnings
-from typing import Optional
 
 import torch
 from functorch.experimental.control_flow import cond  # noqa: F401
@@ -25,7 +24,7 @@ class XLAScaledMMLinearKernel(ScaledMMLinearKernel):
         )
 
     @classmethod
-    def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, Optional[str]]:
+    def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
         if not current_platform.is_tpu():
             return False, "ScaledMMXLA requires running on TPU."
 
@@ -77,17 +76,17 @@ class XLAScaledMMLinearKernel(ScaledMMLinearKernel):
             message="Pred is a Python constant. When used with torch.cond, it specializes on one of the branches.",  # noqa: E501
         )
 
-    def no_add_bias(self, x: torch.Tensor, bias: Optional[torch.Tensor]):
+    def no_add_bias(self, x: torch.Tensor, bias: torch.Tensor | None):
         return x
 
-    def add_bias(self, x: torch.Tensor, bias: Optional[torch.Tensor]):
+    def add_bias(self, x: torch.Tensor, bias: torch.Tensor | None):
         return x + bias
 
     def apply_weights(
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         w_q, w_s, _, _, _ = self._get_weight_params(layer)
 
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index c285b10720d86..7c7769455e8a4 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING, Any, Callable, Optional, Union
+from collections.abc import Callable
+from typing import TYPE_CHECKING, Any, Optional
 
 import torch
 from torch.nn import Module
@@ -92,8 +93,8 @@ class ModelOptFp8Config(QuantizationConfig):
     def __init__(
         self,
         is_checkpoint_fp8_serialized: bool = False,
-        kv_cache_quant_method: Optional[str] = None,
-        exclude_modules: Optional[list[str]] = None,
+        kv_cache_quant_method: str | None = None,
+        exclude_modules: list[str] | None = None,
     ) -> None:
         super().__init__()
         self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
@@ -128,7 +129,7 @@ class ModelOptFp8Config(QuantizationConfig):
     @classmethod
     def override_quantization_method(
         cls, hf_quant_cfg, user_quant
-    ) -> Optional[QuantizationMethods]:
+    ) -> QuantizationMethods | None:
         """Detect if this ModelOpt config should be used based on
         quantization config."""
 
@@ -319,7 +320,7 @@ class ModelOptFp8LinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return self.fp8_linear.apply(
             input=x,
@@ -351,7 +352,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
         )
 
         self.cutlass_fp8_supported = cutlass_fp8_supported()
-        self.flashinfer_moe_backend: Optional[FlashinferMoeBackend] = None
+        self.flashinfer_moe_backend: FlashinferMoeBackend | None = None
         if envs.VLLM_USE_FLASHINFER_MOE_FP8 and has_flashinfer_moe():
             self.flashinfer_moe_backend = get_flashinfer_moe_backend()
             logger.info_once(
@@ -360,7 +361,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
 
     def maybe_make_prepare_finalize(
         self,
-    ) -> Optional[mk.FusedMoEPrepareAndFinalize]:
+    ) -> mk.FusedMoEPrepareAndFinalize | None:
         # TRT LLM not supported with all2all yet.
         if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
             return None
@@ -541,7 +542,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
-    ) -> Optional[FusedMoEQuantConfig]:
+    ) -> FusedMoEQuantConfig | None:
         if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
             return None
 
@@ -561,21 +562,21 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `ModelOptFp8MoEMethod` yet."
@@ -674,7 +675,7 @@ class ModelOptNvFp4Config(QuantizationConfig):
     def __init__(
         self,
         is_checkpoint_nvfp4_serialized: bool,
-        kv_cache_quant_algo: Optional[str],
+        kv_cache_quant_algo: str | None,
         exclude_modules: list[str],
         group_size: int = 16,
     ) -> None:
@@ -713,7 +714,7 @@ class ModelOptNvFp4Config(QuantizationConfig):
     @classmethod
     def override_quantization_method(
         cls, hf_quant_cfg, user_quant
-    ) -> Optional[QuantizationMethods]:
+    ) -> QuantizationMethods | None:
         """Detect if this ModelOpt FP4 config should be used based on
         quantization config."""
         if hf_quant_cfg is None:
@@ -906,7 +907,7 @@ class ModelOptFp8KVCacheMethod(BaseKVCacheMethod):
     Supports loading kv-cache scaling factors from FP8 checkpoints.
     """
 
-    def __init__(self, quant_config: Union[ModelOptFp8Config, ModelOptNvFp4Config]):
+    def __init__(self, quant_config: ModelOptFp8Config | ModelOptNvFp4Config):
         super().__init__(quant_config)
 
 
@@ -1071,7 +1072,7 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if self.backend == "marlin":
             return apply_fp4_marlin_linear(
@@ -1162,7 +1163,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                 " for ModelOptNvFp4FusedMoE."
             )
 
-    def maybe_make_prepare_finalize(self) -> Optional[mk.FusedMoEPrepareAndFinalize]:
+    def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None:
         if self.use_marlin or (
             self.allow_flashinfer
             and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
@@ -1565,7 +1566,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
-    ) -> Optional[FusedMoEQuantConfig]:
+    ) -> FusedMoEQuantConfig | None:
         if (
             self.use_marlin
             or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
@@ -1589,21 +1590,21 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
             raise NotImplementedError(
                 "EPLB not supported for `ModelOptNvFp4FusedMoE` yet."
diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py
index 3719672f6e52f..b0a268b9950b7 100644
--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable
+from typing import Any, Optional
 
 import torch
 
@@ -40,7 +41,7 @@ class MoeWNA16Config(QuantizationConfig):
         group_size: int,
         has_zp: bool,
         lm_head_quantized: bool,
-        modules_to_not_convert: Optional[list[str]],
+        modules_to_not_convert: list[str] | None,
         full_config: dict[str, Any],
     ) -> None:
         super().__init__()
@@ -127,7 +128,7 @@ class MoeWNA16Config(QuantizationConfig):
     @classmethod
     def override_quantization_method(
         cls, hf_quant_cfg, user_quant
-    ) -> Optional[QuantizationMethods]:
+    ) -> QuantizationMethods | None:
         can_convert = cls.is_moe_wna16_compatible(hf_quant_cfg)
         if can_convert and user_quant == "moe_wna16":
             return cls.get_name()
@@ -339,7 +340,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
-    ) -> Optional[FusedMoEQuantConfig]:
+    ) -> FusedMoEQuantConfig | None:
         weight_bits = self.quant_config.weight_bits
         has_zp = self.quant_config.has_zp
         assert weight_bits == 4 or weight_bits == 8
@@ -365,21 +366,21 @@ class MoeWNA16Method(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert self.fused_experts is None
         if enable_eplb:
             raise NotImplementedError("EPLB not supported for `MoeWNA16Method` yet.")
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index dd9532be7585c..5d78b82e3ee7c 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Callable
 from enum import Enum
-from typing import Callable, Optional, Union
+from typing import Optional
 
 import torch
 from torch.nn.parameter import Parameter
@@ -137,7 +138,7 @@ def get_mxfp4_backend():
 
 
 class Mxfp4Config(QuantizationConfig):
-    def __init__(self, ignored_layers: Optional[list[str]] = None):
+    def __init__(self, ignored_layers: list[str] | None = None):
         super().__init__()
         self.ignored_layers = ignored_layers
 
@@ -756,7 +757,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
-    ) -> Optional[FusedMoEQuantConfig]:
+    ) -> FusedMoEQuantConfig | None:
         if self.mxfp4_backend == Mxfp4Backend.MARLIN:
             return mxfp4_w4a16_moe_quant_config(
                 w1_bias=layer.w13_bias,
@@ -824,19 +825,19 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
     ) -> torch.Tensor:
         assert isinstance(self.fused_experts, mk.FusedMoEModularKernel)
 
@@ -890,21 +891,21 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         if enable_eplb:
             raise NotImplementedError("EPLB is not supported for mxfp4")
 
diff --git a/vllm/model_executor/layers/quantization/petit.py b/vllm/model_executor/layers/quantization/petit.py
index 60519bdaea028..402cebc38c215 100644
--- a/vllm/model_executor/layers/quantization/petit.py
+++ b/vllm/model_executor/layers/quantization/petit.py
@@ -41,9 +41,9 @@ class PetitNvFp4Config(QuantizationConfig):
     def __init__(
         self,
         is_checkpoint_nvfp4_serialized: bool = False,
-        kv_cache_quant_algo: Optional[str] = None,
-        group_size: Optional[int] = None,
-        exclude_modules: Optional[list[str]] = None,
+        kv_cache_quant_algo: str | None = None,
+        group_size: int | None = None,
+        exclude_modules: list[str] | None = None,
     ) -> None:
         self._check_hardware_support()
         self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized
@@ -133,7 +133,7 @@ class PetitNvFp4Config(QuantizationConfig):
     @classmethod
     def override_quantization_method(
         cls, hf_quant_cfg, user_quant
-    ) -> Optional[QuantizationMethods]:
+    ) -> QuantizationMethods | None:
         if not current_platform.is_rocm():
             return None
 
@@ -307,7 +307,7 @@ class PetitNvFp4LinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return apply_petit_nvfp4_linear(
             input=x,
diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py
index c0156321f65d2..26ba8e5b16bc0 100644
--- a/vllm/model_executor/layers/quantization/ptpc_fp8.py
+++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py
@@ -34,7 +34,7 @@ class PTPCFp8Config(Fp8Config):
     def __init__(
         self,
         activation_scheme: str = "dynamic",
-        ignored_layers: Optional[list[str]] = None,
+        ignored_layers: list[str] | None = None,
     ) -> None:
         if not current_platform.is_rocm():
             raise ValueError("ptpc_fp8 quantization is supported only on ROCm.")
@@ -125,7 +125,7 @@ class PTPCFp8LinearMethod(Fp8LinearMethod):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return self.fp8_linear.apply(
             input=x,
diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py
index 51f9d56121bdd..d5459594b7983 100644
--- a/vllm/model_executor/layers/quantization/quark/quark.py
+++ b/vllm/model_executor/layers/quantization/quark/quark.py
@@ -43,8 +43,8 @@ class QuarkConfig(QuantizationConfig):
     def __init__(
         self,
         quant_config: dict[str, Any],
-        kv_cache_group: Optional[list[str]] = None,
-        kv_cache_config: Optional[dict[str, Any]] = None,
+        kv_cache_group: list[str] | None = None,
+        kv_cache_config: dict[str, Any] | None = None,
         pack_method: str = "reorder",
     ):
         super().__init__()
@@ -178,8 +178,8 @@ class QuarkConfig(QuantizationConfig):
 
     def _is_fp8_w8a8(
         self,
-        weight_quant: Optional[dict[str, Any]],
-        input_quant: Optional[dict[str, Any]],
+        weight_quant: dict[str, Any] | None,
+        input_quant: dict[str, Any] | None,
     ) -> bool:
         # Confirm weights and input quantized.
         if weight_quant is None or input_quant is None:
@@ -209,8 +209,8 @@ class QuarkConfig(QuantizationConfig):
 
     def _is_static_tensor_w8a8(
         self,
-        weight_quant: Optional[dict[str, Any]],
-        input_quant: Optional[dict[str, Any]],
+        weight_quant: dict[str, Any] | None,
+        input_quant: dict[str, Any] | None,
     ) -> bool:
         # Confirm weights and input quantized.
         if weight_quant is None or input_quant is None:
@@ -237,8 +237,8 @@ class QuarkConfig(QuantizationConfig):
 
     def _is_ocp_mx(
         self,
-        weight_quant: Optional[dict[str, Any]],
-        input_quant: Optional[dict[str, Any]],
+        weight_quant: dict[str, Any] | None,
+        input_quant: dict[str, Any] | None,
     ) -> bool:
         # Confirm weights and input quantized.
         if weight_quant is None or input_quant is None:
@@ -370,7 +370,7 @@ class QuarkConfig(QuantizationConfig):
 
         return scheme
 
-    def get_cache_scale(self, name: str) -> Optional[str]:
+    def get_cache_scale(self, name: str) -> str | None:
         """
         Check whether the param name matches the format for k/v cache scales
         in quark. If this is the case, return its equivalent param name
@@ -429,7 +429,7 @@ class QuarkLinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ):
         """
         Use the output of create_weights and the CompressedTensorsScheme
@@ -454,7 +454,7 @@ class QuarkKVCacheMethod(BaseKVCacheMethod):
         super().__init__(quant_config)
 
     @staticmethod
-    def validate_kv_cache_config(kv_cache_config: Optional[dict[str, Any]]):
+    def validate_kv_cache_config(kv_cache_config: dict[str, Any] | None):
         """
         Validator for the kv cache configuration. Useful for controlling the
         kv cache quantization schemes, that are being supported in vLLM
diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py
index f00188a6f8c40..778317e3a9592 100644
--- a/vllm/model_executor/layers/quantization/quark/quark_moe.py
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable
+from typing import Any
 
 import torch
 
@@ -333,7 +334,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
-    ) -> Optional[FusedMoEQuantConfig]:
+    ) -> FusedMoEQuantConfig | None:
         return fp8_w8a8_moe_quant_config(
             w1_scale=layer.w13_weight_scale,
             w2_scale=layer.w2_weight_scale,
@@ -350,21 +351,21 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert self.fused_experts is None
 
         if enable_eplb:
@@ -568,7 +569,7 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
-    ) -> Optional[FusedMoEQuantConfig]:
+    ) -> FusedMoEQuantConfig | None:
         return ocp_mx_moe_quant_config(
             quant_dtype=self.input_dtype,
             weight_dtype=self.weight_dtype,
@@ -587,21 +588,21 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert self.fused_experts is None
 
         if enable_eplb:
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py
index 0eefa7f7e96c9..1bc1171843d58 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py
@@ -1,9 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from collections.abc import Callable
 from fractions import Fraction
 from functools import cache, partial
-from typing import Any, Callable, Optional, Union
+from typing import Any
 
 import torch
 import torch.nn.functional as F
@@ -54,8 +55,8 @@ try:
         weight: torch.Tensor,
         weight_scale: torch.Tensor,
         rocm_use_aiter_fp4_asm_gemm: bool = False,
-        out_dtype: Optional[torch.dtype] = torch.bfloat16,
-        x_scales: Optional[torch.Tensor] = None,
+        out_dtype: torch.dtype | None = torch.bfloat16,
+        x_scales: torch.Tensor | None = None,
     ) -> torch.Tensor:
         M = x.shape[0]
         if rocm_use_aiter_fp4_asm_gemm:
@@ -95,7 +96,7 @@ try:
         weight_scale: torch.Tensor,
         x_scales: torch.Tensor = None,
         rocm_use_aiter_fp4_asm_gemm: bool = False,
-        out_dtype: Optional[torch.dtype] = torch.bfloat16,
+        out_dtype: torch.dtype | None = torch.bfloat16,
     ) -> torch.Tensor:
         return torch.empty(
             (*x.shape[:-1], weight.shape[0]), dtype=out_dtype, device=x.device
@@ -129,7 +130,7 @@ class QuarkOCP_MX(QuarkScheme):
         )
 
         if self.weight_dtype == "mxfp4":
-            self.packed_factor: Union[int, Fraction] = 2
+            self.packed_factor: int | Fraction = 2
             self.dequant_func = dequant_mxfp4
         else:
             self.packed_factor = Fraction(numerator=8, denominator=6)
@@ -282,7 +283,7 @@ class QuarkOCP_MX(QuarkScheme):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if self.emulate:
             dq_w = self.dequant_func(layer.weight, layer.weight_scale, x.dtype)
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
index ddec0f6ea8eb8..412a07a85fe75 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from abc import ABC, abstractmethod
-from typing import Optional
 
 import torch
 
@@ -33,7 +32,7 @@ class QuarkScheme(ABC):
 
     @abstractmethod
     def apply_weights(
-        self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]
+        self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None
     ):
         """
         Run the forward pass for the particular scheme. This is where
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
index 553698a7dc94a..1e5ee93b61f2b 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Any, Callable, Optional, cast
+from collections.abc import Callable
+from typing import Any, cast
 
 import torch
 from torch.nn import Parameter
@@ -25,11 +26,11 @@ __all__ = ["QuarkW8A8Fp8"]
 
 class QuarkW8A8Fp8(QuarkScheme):
     def __init__(
-        self, weight_config: dict[str, Any], input_config: Optional[dict[str, Any]]
+        self, weight_config: dict[str, Any], input_config: dict[str, Any] | None
     ):
         self.weight_qscheme = cast(str, weight_config.get("qscheme"))
         self.is_static_input_scheme: bool = False
-        self.input_qscheme: Optional[str] = None
+        self.input_qscheme: str | None = None
         if input_config is not None:
             self.is_static_input_scheme = not cast(bool, input_config.get("is_dynamic"))
             self.input_qscheme = cast(str, input_config.get("qscheme"))
@@ -166,7 +167,7 @@ class QuarkW8A8Fp8(QuarkScheme):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return self.fp8_linear.apply(
             input=x,
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
index c41dd05d10629..42d2ed2e85ed9 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 
@@ -27,8 +27,8 @@ class QuarkW8A8Int8(QuarkScheme):
     def __init__(
         self,
         qscheme: str,
-        is_static_input_scheme: Optional[bool],
-        input_symmetric: Optional[bool],
+        is_static_input_scheme: bool | None,
+        input_symmetric: bool | None,
     ):
         self.qscheme = qscheme
         self.is_static_input_scheme = is_static_input_scheme
@@ -134,6 +134,6 @@ class QuarkW8A8Int8(QuarkScheme):
         self.kernel.process_weights_after_loading(layer)
 
     def apply_weights(
-        self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]
+        self, layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None
     ) -> torch.Tensor:
         return self.kernel.apply_weights(layer, x, bias)
diff --git a/vllm/model_executor/layers/quantization/quark/utils.py b/vllm/model_executor/layers/quantization/quark/utils.py
index 0eb4b20a6e52c..dc82f94ebbbfb 100644
--- a/vllm/model_executor/layers/quantization/quark/utils.py
+++ b/vllm/model_executor/layers/quantization/quark/utils.py
@@ -3,7 +3,7 @@
 
 from collections.abc import Iterable, Mapping
 from types import MappingProxyType
-from typing import Any, Optional
+from typing import Any
 
 import regex as re
 
@@ -22,7 +22,7 @@ def deep_compare(dict1: Any, dict2: Any) -> bool:
 
 
 def should_ignore_layer(
-    layer_name: Optional[str],
+    layer_name: str | None,
     ignore: Iterable[str],
     fused_mapping: Mapping[str, list[str]] = MappingProxyType({}),
 ) -> bool:
diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py
index e0070e207048f..3d5dd6c497436 100644
--- a/vllm/model_executor/layers/quantization/rtn.py
+++ b/vllm/model_executor/layers/quantization/rtn.py
@@ -3,7 +3,8 @@
 # Copyright © 2025, Oracle and/or its affiliates.
 
 import os
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable
+from typing import Any, Optional
 
 import torch
 import torch.nn.functional as F
@@ -226,7 +227,7 @@ class RTNLinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         qweight = layer.weight
         scale = layer.scale
@@ -320,7 +321,7 @@ class RTNMoEMethod(FusedMoEMethodBase):
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
-    ) -> Optional[FusedMoEQuantConfig]:
+    ) -> FusedMoEQuantConfig | None:
         weight_bits = self.quant_config.weight_bits
         group_size = self.quant_config.group_size
         assert weight_bits == 4 or weight_bits == 8
@@ -345,21 +346,21 @@ class RTNMoEMethod(FusedMoEMethodBase):
         top_k: int,
         renormalize: bool,
         use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
+        topk_group: int | None = None,
+        num_expert_group: int | None = None,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
+        expert_map: torch.Tensor | None = None,
+        custom_routing_function: Callable | None = None,
         scoring_func: str = "softmax",
         routed_scaling_factor: float = 1.0,
-        e_score_correction_bias: Optional[torch.Tensor] = None,
+        e_score_correction_bias: torch.Tensor | None = None,
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
         enable_eplb: bool = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        expert_load_view: torch.Tensor | None = None,
+        logical_to_physical_map: torch.Tensor | None = None,
+        logical_replica_count: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert self.fused_experts is None
 
         if enable_eplb:
diff --git a/vllm/model_executor/layers/quantization/schema.py b/vllm/model_executor/layers/quantization/schema.py
index 9396da0ecd1a0..669bd9d6ed837 100644
--- a/vllm/model_executor/layers/quantization/schema.py
+++ b/vllm/model_executor/layers/quantization/schema.py
@@ -13,8 +13,6 @@ possible on ROCm), the model can be optionally augmented with KV cache
 scaling factors.
 """
 
-from typing import Optional
-
 from pydantic import BaseModel, ConfigDict, ValidationInfo, model_validator
 
 
@@ -75,7 +73,7 @@ class QuantParamSchema(BaseModel):
     # TODO: Generalize and extend with more fields
     # (e.g. weights/activations params) once functionality is enabled
     model_config = ConfigDict(protected_namespaces=())
-    model_type: Optional[str]
+    model_type: str | None
     kv_cache: KVCacheQuantSchema
 
     @model_validator(mode="after")
diff --git a/vllm/model_executor/layers/quantization/torchao.py b/vllm/model_executor/layers/quantization/torchao.py
index 6f076401ac32e..f42c45dae76d2 100644
--- a/vllm/model_executor/layers/quantization/torchao.py
+++ b/vllm/model_executor/layers/quantization/torchao.py
@@ -63,7 +63,7 @@ class TorchAOConfig(QuantizationConfig):
     def __init__(
         self,
         torchao_config,
-        skip_modules: Optional[list[str]] = None,
+        skip_modules: list[str] | None = None,
         is_checkpoint_torchao_serialized: bool = False,
     ) -> None:
         """
@@ -301,7 +301,7 @@ class TorchAOLinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return F.linear(x, layer.weight, bias)
 
diff --git a/vllm/model_executor/layers/quantization/tpu_int8.py b/vllm/model_executor/layers/quantization/tpu_int8.py
index a24cd41659a0e..64bfa8fb80eb2 100644
--- a/vllm/model_executor/layers/quantization/tpu_int8.py
+++ b/vllm/model_executor/layers/quantization/tpu_int8.py
@@ -119,7 +119,7 @@ class TPUInt8LinearMethod(LinearMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         try:
             import torch_xla.experimental.custom_kernel  # noqa: F401
diff --git a/vllm/model_executor/layers/quantization/utils/bitblas_utils.py b/vllm/model_executor/layers/quantization/utils/bitblas_utils.py
index 4b7a22a266533..62a4f90366887 100644
--- a/vllm/model_executor/layers/quantization/utils/bitblas_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/bitblas_utils.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 from packaging import version
@@ -29,7 +28,7 @@ BITBLAS_SUPPORTED_SYM = [False, True]
 # Determines the supported quantization types for BitBLAS based on the
 # device's capability and whether zero-point (zp) is used.
 def query_bitblas_supported_quant_types(
-    has_zp: bool, device_capability: Optional[int] = None
+    has_zp: bool, device_capability: int | None = None
 ):
     if device_capability is None:
         capability_tuple = current_platform.get_device_capability()
@@ -52,10 +51,10 @@ def query_bitblas_supported_quant_types(
 
 def _check_bitblas_supported(
     quant_type: ScalarType,
-    group_size: Optional[int],
+    group_size: int | None,
     has_zp: bool,
-    device_capability: Optional[int] = None,
-) -> tuple[bool, Optional[str]]:
+    device_capability: int | None = None,
+) -> tuple[bool, str | None]:
     if device_capability is None:
         capability_tuple = current_platform.get_device_capability()
         device_capability = (
@@ -99,7 +98,7 @@ def check_bitblas_supported(
     quant_type: ScalarType,
     group_size: int,
     has_zp: bool = False,
-    device_capability: Optional[int] = None,
+    device_capability: int | None = None,
 ) -> bool:
     cond, _ = _check_bitblas_supported(
         quant_type, group_size, has_zp, device_capability
@@ -156,7 +155,7 @@ def check_bitblas_supports_shape(
     input_size_per_partition: int,
     input_size: int,
     group_size: int,
-) -> tuple[bool, Optional[str]]:
+) -> tuple[bool, str | None]:
     try:
         verify_bitblas_supports_shape(
             output_size_per_partition, input_size_per_partition, input_size, group_size
diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
index 7059a029ba67e..1c6b5de83b2ba 100644
--- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
+++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
@@ -2,8 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utility helpers for NVFP4 + FlashInfer fused-MoE path"""
 
-from __future__ import annotations
-
 import torch
 
 import vllm.envs as envs
diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
index 7f32ef00647ca..8fce7235bdded 100644
--- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from enum import Enum
-from typing import Optional
 
 import torch
 
@@ -101,10 +100,10 @@ def apply_flashinfer_per_tensor_scale_fp8(
     layer: torch.nn.Module,
     hidden_states: torch.Tensor,
     router_logits: torch.Tensor,
-    routing_bias: Optional[torch.Tensor],
+    routing_bias: torch.Tensor | None,
     top_k: int,
-    num_expert_group: Optional[int],
-    topk_group: Optional[int],
+    num_expert_group: int | None,
+    topk_group: int | None,
     global_num_experts: int,
     apply_router_weight_on_input: bool,
 ) -> torch.Tensor:
@@ -186,7 +185,7 @@ def register_moe_scaling_factors(layer: torch.nn.Module) -> None:
 
 
 def build_flashinfer_fp8_cutlass_moe_prepare_finalize(
-    moe: Optional[FusedMoEConfig],
+    moe: FusedMoEConfig | None,
 ) -> mk.FusedMoEPrepareAndFinalize:
     """Create a FlashInfer CUTLASS fused-MoE prepare finalize kernel"""
     use_dp = moe.moe_parallel_config.dp_size > 1 if moe is not None else False
@@ -194,9 +193,9 @@ def build_flashinfer_fp8_cutlass_moe_prepare_finalize(
 
 
 def select_cutlass_fp8_gemm_impl(
-    moe: Optional[FusedMoEConfig],
+    moe: FusedMoEConfig | None,
     quant_config: FusedMoEQuantConfig,
-    out_dtype: Optional[torch.dtype] = None,
+    out_dtype: torch.dtype | None = None,
 ) -> mk.FusedMoEPermuteExpertsUnpermute:
     """Return a GEMM *experts* implementation for fused-MoE layers"""
 
@@ -225,7 +224,7 @@ def flashinfer_cutlass_moe_fp8(
     inplace: bool = False,
     activation: str = "silu",
     global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
+    expert_map: torch.Tensor | None = None,
     apply_router_weight_on_input: bool = False,
 ) -> torch.Tensor:
     quant_config = layer.quant_method.get_fused_moe_quant_config(layer)
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index fa34dba371e81..51af40a119147 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -5,8 +5,8 @@
 import functools
 import json
 import os
-from collections.abc import Sequence
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable, Sequence
+from typing import Any
 
 import torch
 
@@ -39,7 +39,7 @@ from vllm.utils.deep_gemm import (
 logger = init_logger(__name__)
 
 
-def is_fp8(x: Union[torch.dtype, torch.Tensor]) -> bool:
+def is_fp8(x: torch.dtype | torch.Tensor) -> bool:
     if isinstance(x, torch.Tensor):
         x = x.dtype
     return x == torch.float8_e4m3fn or x == torch.float8_e4m3fnuz
@@ -54,7 +54,7 @@ def cutlass_scaled_mm(
     Bs: torch.Tensor,
     block_size: list[int],
     output_dtype: torch.dtype = torch.float16,
-    is_hopper: Optional[bool] = None,
+    is_hopper: bool | None = None,
 ) -> torch.Tensor:
     if is_hopper is None:
         is_hopper = current_platform.is_device_capability(90)
@@ -279,8 +279,8 @@ class W8A8BlockFp8LinearOp:
         input: torch.Tensor,
         weight: torch.Tensor,
         weight_scale: torch.Tensor,
-        input_scale: Optional[torch.Tensor] = None,
-        bias: Optional[torch.Tensor] = None,
+        input_scale: torch.Tensor | None = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         assert input_scale is None
         # View input as 2D matrix for fp8 methods
@@ -394,7 +394,7 @@ class W8A8BlockFp8LinearOp:
             ],
             torch.Tensor,
         ],
-        Optional[QuantFP8],
+        QuantFP8 | None,
     ]:
         if use_cutlass:
             return self._run_cutlass, (
@@ -418,7 +418,7 @@ class W8A8BlockFp8LinearOp:
 
 
 def input_to_float8(
-    x: torch.Tensor, dtype: Optional[torch.dtype] = None
+    x: torch.Tensor, dtype: torch.dtype | None = None
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """This function quantizes input values to float8 values "
     "with tensor-wise quantization."""
@@ -568,10 +568,10 @@ def per_token_group_quant_fp8(
     x: torch.Tensor,
     group_size: int,
     eps: float = 1e-10,
-    dtype: Optional[torch.dtype] = None,
+    dtype: torch.dtype | None = None,
     column_major_scales: bool = False,
-    out_q: Optional[torch.Tensor] = None,
-    use_ue8m0: Optional[bool] = None,
+    out_q: torch.Tensor | None = None,
+    use_ue8m0: bool | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """Function to perform per-token-group quantization on an input tensor `x`.
     It converts the tensor values into signed float8 values and returns the
@@ -754,7 +754,7 @@ def _w8a8_triton_block_scaled_mm(
 @functools.lru_cache
 def get_w8a8_block_fp8_configs(
     N: int, K: int, block_n: int, block_k: int
-) -> Optional[dict[int, Any]]:
+) -> dict[int, Any] | None:
     """
     Return optimized configurations for the w8a8 block fp8 kernel.
     The return value will be a dictionary that maps an irregular grid of
@@ -1012,7 +1012,7 @@ def validate_fp8_block_shape(
 def create_fp8_weight_parameter(
     output_size_per_partition: int,
     input_size_per_partition: int,
-    weight_loader: Optional[Callable],
+    weight_loader: Callable | None,
 ) -> torch.nn.Parameter:
     """Create FP8 weight parameter."""
     from vllm.model_executor.parameter import ModelWeightParameter
@@ -1033,8 +1033,8 @@ def create_fp8_scale_parameter(
     parameter_type: torch.nn.Parameter,
     output_partition_sizes: list[int],
     input_size_per_partition: int,
-    block_size: Optional[list[int]],
-    weight_loader: Optional[Callable],
+    block_size: list[int] | None,
+    weight_loader: Callable | None,
 ) -> torch.nn.Parameter:
     """Create scale parameter based on quantization strategy."""
     if parameter_type == ChannelQuantScaleParameter:
@@ -1070,7 +1070,7 @@ def create_fp8_scale_parameter(
 
 
 def create_fp8_input_scale(
-    output_partition_sizes: list[int], weight_loader: Optional[Callable]
+    output_partition_sizes: list[int], weight_loader: Callable | None
 ) -> torch.nn.Parameter:
     """Create input scale parameter for static activation quantization."""
     from vllm.model_executor.parameter import PerTensorScaleParameter
@@ -1087,8 +1087,8 @@ def process_fp8_weight_tensor_strategy(
     weight: torch.Tensor,
     weight_scale: torch.Tensor,
     logical_widths: list[int],
-    input_scale: Optional[torch.Tensor] = None,
-) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    input_scale: torch.Tensor | None = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
     """Process weights for tensor-wise quantization strategy."""
     from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
         normalize_e4m3fn_to_e4m3fnuz,
@@ -1114,8 +1114,8 @@ def process_fp8_weight_tensor_strategy(
 def process_fp8_weight_channel_strategy(
     weight: torch.Tensor,
     weight_scale: torch.Tensor,
-    input_scale: Optional[torch.Tensor] = None,
-) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    input_scale: torch.Tensor | None = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
     """Process weights for channel-wise quantization strategy."""
     from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
         normalize_e4m3fn_to_e4m3fnuz,
diff --git a/vllm/model_executor/layers/quantization/utils/gptq_utils.py b/vllm/model_executor/layers/quantization/utils/gptq_utils.py
index 6209dda955ce7..dfebeca933920 100644
--- a/vllm/model_executor/layers/quantization/utils/gptq_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/gptq_utils.py
@@ -4,7 +4,7 @@ from collections.abc import Mapping
 from copy import deepcopy
 from fractions import Fraction
 from types import MappingProxyType
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING
 
 import regex as re
 import torch
@@ -25,7 +25,7 @@ else:
 
 # Match dynamic rules with module name (prefix) and override quantize
 # config if module (prefix) matches a rule
-def override_config(config: Union[GPTQConfig, GPTQMarlinConfig], prefix: str):
+def override_config(config: GPTQConfig | GPTQMarlinConfig, prefix: str):
     weight_bits = get_dynamic_override(config, prefix, "bits", config.weight_bits)
     if isinstance(weight_bits, int):
         config.weight_bits = weight_bits
@@ -60,11 +60,11 @@ def override_config(config: Union[GPTQConfig, GPTQMarlinConfig], prefix: str):
 
 
 def get_dynamic_override(
-    config: Union[GPTQConfig, GPTQMarlinConfig],
+    config: GPTQConfig | GPTQMarlinConfig,
     layer_name: str,
-    key: Optional[str] = None,
-    default_value: Union[int, bool, None] = None,
-) -> Union[dict, int, bool, None]:
+    key: str | None = None,
+    default_value: int | bool | None = None,
+) -> dict | int | bool | None:
     for pattern, pattern_dict in config.dynamic.items():
         # Negative match: matched modules are excluded from quantized init
         if pattern.startswith("-:"):
@@ -126,7 +126,7 @@ def is_layer_gptq_quantized(
 
 
 def get_linear_quant_method(
-    config: Union[GPTQConfig, GPTQMarlinConfig],
+    config: GPTQConfig | GPTQMarlinConfig,
     layer: torch.nn.Module,
     prefix: str,
     linear_method_cls: type,
diff --git a/vllm/model_executor/layers/quantization/utils/int8_utils.py b/vllm/model_executor/layers/quantization/utils/int8_utils.py
index 1b8efe4332c54..925d0a516ce63 100644
--- a/vllm/model_executor/layers/quantization/utils/int8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/int8_utils.py
@@ -6,7 +6,7 @@ import functools
 import json
 import logging
 import os
-from typing import Any, Optional
+from typing import Any
 
 import torch
 
@@ -21,8 +21,8 @@ def apply_w8a8_block_int8_linear(
     weight: torch.Tensor,
     block_size: list[int],
     weight_scale: torch.Tensor,
-    input_scale: Optional[torch.Tensor] = None,
-    bias: Optional[torch.Tensor] = None,
+    input_scale: torch.Tensor | None = None,
+    bias: torch.Tensor | None = None,
 ) -> torch.Tensor:
     assert input_scale is None
     # View input as 2D matrix for fp8 methods
@@ -359,7 +359,7 @@ def _w8a8_block_int8_matmul(
 @functools.lru_cache
 def get_w8a8_block_int8_configs(
     N: int, K: int, block_n: int, block_k: int
-) -> Optional[dict[int, Any]]:
+) -> dict[int, Any] | None:
     """
     Return optimized configurations for the w8a8 block fp8 kernel.
 
diff --git a/vllm/model_executor/layers/quantization/utils/layer_utils.py b/vllm/model_executor/layers/quantization/utils/layer_utils.py
index 4bf31340a2f68..3b8c9a8b6ca1f 100644
--- a/vllm/model_executor/layers/quantization/utils/layer_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/layer_utils.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Union
 
 import torch
 
@@ -21,7 +20,7 @@ def update_tensor_inplace(dst: torch.Tensor, src: torch.Tensor):
 # Newly generated tensors need to replace existing tensors that are
 # already registered as parameters by vLLM (and won't be freed)
 def replace_parameter(
-    mod: torch.nn.Module, name: str, new: Union[torch.Tensor, torch.nn.Parameter]
+    mod: torch.nn.Module, name: str, new: torch.Tensor | torch.nn.Parameter
 ) -> None:
     old = getattr(mod, name)
     if (
diff --git a/vllm/model_executor/layers/quantization/utils/machete_utils.py b/vllm/model_executor/layers/quantization/utils/machete_utils.py
index 69466bdcb64c2..ccfcdac1ec0fe 100644
--- a/vllm/model_executor/layers/quantization/utils/machete_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/machete_utils.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -41,7 +40,7 @@ def query_machete_supported_group_sizes(act_type: torch.dtype) -> list[int]:
 
 def check_machete_supports_shape(
     in_features: int, out_featrues: int
-) -> tuple[bool, Optional[str]]:
+) -> tuple[bool, str | None]:
     if in_features % MACHETE_PREPACKED_BLOCK_SHAPE[0] != 0:
         return (
             False,
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index d2fa5af1b8540..fd6b581d2b90a 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import numpy
 import torch
@@ -34,9 +33,9 @@ USE_FP32_REDUCE_DEFAULT = True
 #  without runtime zero-point. We support common cases, i.e. AWQ and GPTQ.
 #  TODO: we may want to move this into the C++ so its closer to the actual impl
 def query_marlin_supported_quant_types(
-    has_zp: Optional[bool] = None,
+    has_zp: bool | None = None,
     include_fp_type: bool = True,
-    device_capability: Optional[int] = None,
+    device_capability: int | None = None,
 ):
     if device_capability is None:
         capability_tuple = current_platform.get_device_capability()
@@ -72,10 +71,10 @@ def query_marlin_supported_quant_types(
 
 def _check_marlin_supported(
     quant_type: ScalarType,
-    group_size: Optional[int],
+    group_size: int | None,
     has_zp: bool,
-    device_capability: Optional[int] = None,
-) -> tuple[bool, Optional[str]]:
+    device_capability: int | None = None,
+) -> tuple[bool, str | None]:
     if device_capability is None:
         capability_tuple = current_platform.get_device_capability()
         device_capability = (
@@ -109,7 +108,7 @@ def check_marlin_supported(
     quant_type: ScalarType,
     group_size: int,
     has_zp: bool = False,
-    device_capability: Optional[int] = None,
+    device_capability: int | None = None,
 ) -> bool:
     cond, _ = _check_marlin_supported(quant_type, group_size, has_zp, device_capability)
     return cond
@@ -164,7 +163,7 @@ def check_marlin_supports_shape(
     input_size_per_partition: int,
     input_size: int,
     group_size: int,
-) -> tuple[bool, Optional[str]]:
+) -> tuple[bool, str | None]:
     try:
         verify_marlin_supports_shape(
             output_size_per_partition, input_size_per_partition, input_size, group_size
@@ -445,7 +444,7 @@ def apply_gptq_marlin_linear(
     output_size_per_partition: int,
     input_size_per_partition: int,
     is_k_full: bool,
-    bias: Optional[torch.Tensor] = None,
+    bias: torch.Tensor | None = None,
     use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
 ) -> torch.Tensor:
     reshaped_x = input.reshape(-1, input.shape[-1])
@@ -494,7 +493,7 @@ def apply_awq_marlin_linear(
     quant_type: ScalarType,
     output_size_per_partition: int,
     input_size_per_partition: int,
-    bias: Optional[torch.Tensor] = None,
+    bias: torch.Tensor | None = None,
     use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
 ) -> torch.Tensor:
     reshaped_x = input.reshape(-1, input.shape[-1])
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
index c5e34f392fb22..842fb9b62267a 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -95,11 +94,11 @@ def apply_fp4_marlin_linear(
     input: torch.Tensor,
     weight: torch.Tensor,
     weight_scale: torch.Tensor,
-    weight_scale_2: Optional[torch.Tensor],
+    weight_scale_2: torch.Tensor | None,
     workspace: torch.Tensor,
     size_n: int,
     size_k: int,
-    bias: Optional[torch.Tensor] = None,
+    bias: torch.Tensor | None = None,
     use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
 ) -> torch.Tensor:
     # For GPUs that lack FP4 hardware support, we can leverage the
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
index 9348ac158daa7..8c96848a85397 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -45,7 +44,7 @@ def apply_fp8_marlin_linear(
     workspace: torch.Tensor,
     size_n: int,
     size_k: int,
-    bias: Optional[torch.Tensor],
+    bias: torch.Tensor | None,
     use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
 ) -> torch.Tensor:
     # For GPUs that lack FP8 hardware support, we can leverage the
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
index 1bbd88d5ca710..89756c45ef556 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
@@ -2,8 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utility functions used for tests and benchmarks"""
 
-from typing import Optional
-
 import numpy as np
 import torch
 
@@ -100,7 +98,7 @@ def marlin_quantize(
     quant_type: ScalarType,
     group_size: int,
     act_order: bool,
-    test_perm: Optional[torch.Tensor] = None,
+    test_perm: torch.Tensor | None = None,
 ):
     size_k, size_n = w.shape
     num_bits = quant_type.size_bits
diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py
index ee6c826f8b2c5..231d7dc6ce41b 100644
--- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any, Callable, Optional
+from collections.abc import Callable
+from typing import Any
 
 import torch
 
@@ -71,17 +72,17 @@ def _swizzle_mxfp4(quant_tensor, scale, num_warps):
 
 def _can_support_mxfp4(
     use_grouped_topk: bool = False,
-    topk_group: Optional[int] = None,
-    num_expert_group: Optional[int] = None,
-    expert_map: Optional[torch.Tensor] = None,
-    custom_routing_function: Optional[Callable] = None,
-    e_score_correction_bias: Optional[torch.Tensor] = None,
+    topk_group: int | None = None,
+    num_expert_group: int | None = None,
+    expert_map: torch.Tensor | None = None,
+    custom_routing_function: Callable | None = None,
+    e_score_correction_bias: torch.Tensor | None = None,
     apply_router_weight_on_input: bool = False,
     scoring_func: str = "softmax",
     activation: str = "swigluoai",
-    expert_load_view: Optional[torch.Tensor] = None,
-    logical_to_physical_map: Optional[torch.Tensor] = None,
-    logical_replica_count: Optional[torch.Tensor] = None,
+    expert_load_view: torch.Tensor | None = None,
+    logical_to_physical_map: torch.Tensor | None = None,
+    logical_replica_count: torch.Tensor | None = None,
 ):
     return not (
         use_grouped_topk
diff --git a/vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py b/vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py
index 3c71441a3df7e..7752324f41fee 100644
--- a/vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from enum import Enum
-from typing import Union
 
 from vllm.logger import init_logger
 
@@ -28,9 +27,7 @@ class OCP_MX_Scheme(str, Enum):
     w_mxfp6_e2m3_a_mxfp6_e2m3 = "w_mxfp6_e2m3_a_mxfp6_e2m3"
 
     @classmethod
-    def from_quant_dtype(
-        cls, input_dtype: Union[str, None], weight_dtype: Union[str, None]
-    ):
+    def from_quant_dtype(cls, input_dtype: str | None, weight_dtype: str | None):
         if input_dtype not in OCP_MX_DTYPES or weight_dtype not in OCP_MX_DTYPES:
             return None
         elif input_dtype == "mxfp4" and weight_dtype == "mxfp4":
diff --git a/vllm/model_executor/layers/quantization/utils/petit_utils.py b/vllm/model_executor/layers/quantization/utils/petit_utils.py
index 1f053103fc3c6..081f53eac9390 100644
--- a/vllm/model_executor/layers/quantization/utils/petit_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/petit_utils.py
@@ -43,8 +43,8 @@ _require_petit = _import_petit_kernel
 
 
 def _check_petit_nvfp4_supported(
-    quant_method: str, group_size: Optional[int]
-) -> tuple[bool, Optional[str]]:
+    quant_method: str, group_size: int | None
+) -> tuple[bool, str | None]:
     if quant_method != "NVFP4":
         return (
             False,
@@ -62,7 +62,7 @@ def _check_petit_nvfp4_supported(
     return (True, None)
 
 
-def verify_petit_nvfp4_supported(quant_method: str, group_size: Optional[int]) -> None:
+def verify_petit_nvfp4_supported(quant_method: str, group_size: int | None) -> None:
     supported, error_msg = _check_petit_nvfp4_supported(quant_method, group_size)
     if not supported:
         assert error_msg is not None
@@ -98,7 +98,7 @@ def apply_petit_nvfp4_linear(
     weight_scale_2: torch.Tensor,
     size_n: int,
     size_k: int,
-    bias: Optional[torch.Tensor] = None,
+    bias: torch.Tensor | None = None,
 ) -> torch.Tensor:
     # Trigger (or get) the import here as well.
     petit_kernel = _import_petit_kernel()
diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py
index 2e9b279465f93..c2ecf4c02828d 100644
--- a/vllm/model_executor/layers/quantization/utils/quant_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py
@@ -5,7 +5,7 @@
 from collections.abc import Mapping
 from dataclasses import dataclass
 from types import MappingProxyType
-from typing import ClassVar, NamedTuple, Optional
+from typing import ClassVar, NamedTuple
 
 import numpy
 import torch
@@ -91,7 +91,7 @@ class QuantKey:
 
     dtype: torch.dtype
     scale: ScaleDesc
-    scale2: Optional[ScaleDesc] = None
+    scale2: ScaleDesc | None = None
     symmetric: bool = True
 
     def __str__(self):
@@ -205,7 +205,7 @@ def scaled_quantize(
 def scaled_dequantize(
     x_q: torch.Tensor,
     x_s: torch.Tensor,
-    group_shape: Optional[GroupShape] = None,
+    group_shape: GroupShape | None = None,
     out_dtype: torch.dtype = torch.float32,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     if group_shape is not None:
@@ -336,7 +336,7 @@ def permute_rows(
     q_w: torch.Tensor,
     w_ref: torch.Tensor,
     group_size: int,
-    test_perm: Optional[torch.Tensor] = None,
+    test_perm: torch.Tensor | None = None,
 ):
     assert q_w.shape == w_ref.shape
 
@@ -365,7 +365,7 @@ def permute_rows(
 def quantize_weights(
     w: torch.Tensor,
     quant_type: ScalarType,
-    group_size: Optional[int],
+    group_size: int | None,
     zero_points: bool = False,
     ref_zero_points_after_scales: bool = False,
 ):
@@ -466,7 +466,7 @@ def gptq_quantize_weights(
     quant_type: ScalarType,
     group_size: int,
     act_order: bool,
-    test_perm: Optional[torch.Tensor] = None,
+    test_perm: torch.Tensor | None = None,
 ):
     size_k, _ = w.shape
 
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index c26cd4f28cb69..44feb24a1eefc 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Callable, Optional, Union
+from collections.abc import Callable
 
 import torch
 from packaging import version
@@ -75,7 +75,7 @@ CUTLASS_BLOCK_FP8_SUPPORTED = cutlass_block_fp8_supported()
 
 
 def per_tensor_dequantize(
-    tensor: torch.Tensor, inv_scale: Union[float, torch.Tensor]
+    tensor: torch.Tensor, inv_scale: float | torch.Tensor
 ) -> torch.Tensor:
     fake_qweight = tensor.to(torch.float16)
     dq_weight = fake_qweight * inv_scale
@@ -399,7 +399,7 @@ class Fp8LinearOp:
         self,
         act_quant_static: bool,
         act_quant_group_shape: GroupShape = GroupShape.PER_TENSOR,
-        pad_output: Optional[bool] = None,
+        pad_output: bool | None = None,
     ):
         if current_platform.is_rocm():
             self.preferred_backend = "rocm"
@@ -437,10 +437,10 @@ class Fp8LinearOp:
         input: torch.Tensor,
         weight: torch.Tensor,
         weight_scale: torch.Tensor,
-        out_dtype: Optional[torch.dtype] = None,
-        input_scale: Optional[torch.Tensor] = None,
-        input_scale_ub: Optional[torch.Tensor] = None,
-        bias: Optional[torch.Tensor] = None,
+        out_dtype: torch.dtype | None = None,
+        input_scale: torch.Tensor | None = None,
+        input_scale_ub: torch.Tensor | None = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         # ops.scaled_fp8_quant supports both dynamic and static quant.
         #   If dynamic, layer.input_scale is None and x_scale computed from x.
@@ -486,8 +486,8 @@ class Fp8LinearOp:
 def normalize_e4m3fn_to_e4m3fnuz(
     weight: torch.Tensor,
     weight_scale: torch.Tensor,
-    input_scale: Optional[torch.Tensor] = None,
-) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    input_scale: torch.Tensor | None = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
     assert weight.dtype == torch.float8_e4m3fn
     # The bits pattern 10000000(-128) represents zero in e4m3fn
     # but NaN in e4m3fnuz. So here we set it to 0.
diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py
index 8dc237f8232d7..c9fa8054625e4 100644
--- a/vllm/model_executor/layers/resampler.py
+++ b/vllm/model_executor/layers/resampler.py
@@ -34,8 +34,8 @@ Example models: Qwen (Qwen-VL), MiniCPM-V 2.0
 """
 
 import math
+from collections.abc import Callable
 from functools import partial
-from typing import Callable, Optional, Union
 
 import numpy as np
 import torch
@@ -48,9 +48,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 DEFAULT_LN = partial(nn.LayerNorm, eps=1e-6)
 
 
-def get_abs_pos(
-    abs_pos: torch.Tensor, tgt_size: Union[torch.Tensor, int]
-) -> torch.Tensor:
+def get_abs_pos(abs_pos: torch.Tensor, tgt_size: torch.Tensor | int) -> torch.Tensor:
     # abs_pos: L, C
     # tgt_size: (H, W)
     # return: M, C
@@ -124,7 +122,7 @@ def get_2d_sincos_pos_embed_from_grid(
 
 def get_2d_sincos_pos_embed(
     embed_dim: int,
-    grid_size: Union[int, tuple[int, int]],
+    grid_size: int | tuple[int, int],
     cls_token: bool = False,
     version: tuple[int, int] = (2, 0),
 ) -> torch.Tensor:
@@ -168,10 +166,10 @@ class BaseResampler(nn.Module):
         num_queries: int,
         embed_dim: int,
         num_heads: int,
-        kv_dim: Optional[int] = None,
+        kv_dim: int | None = None,
         norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
         do_post_projection: bool = True,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -222,11 +220,11 @@ class Resampler2(BaseResampler):
         grid_size: int,
         embed_dim: int,
         num_heads: int,
-        kv_dim: Optional[int] = None,
+        kv_dim: int | None = None,
         norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
         adaptive: bool = False,
         do_post_projection: bool = True,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__(
@@ -250,8 +248,8 @@ class Resampler2(BaseResampler):
     def forward(
         self,
         x: torch.Tensor,
-        tgt_sizes: Optional[torch.Tensor] = None,
-        attn_mask: Optional[torch.Tensor] = None,
+        tgt_sizes: torch.Tensor | None = None,
+        attn_mask: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if tgt_sizes is None:
             tgt_sizes = int(math.sqrt(x.size(1)))
diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py
index e6956de4bfaaa..64187c97cab7e 100644
--- a/vllm/model_executor/layers/rotary_embedding/__init__.py
+++ b/vllm/model_executor/layers/rotary_embedding/__init__.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Rotary Positional Embeddings."""
 
-from typing import Any, Optional
+from typing import Any
 
 import torch
 
@@ -28,10 +28,10 @@ def get_rope(
     max_position: int,
     base: float,
     is_neox_style: bool = True,
-    rope_scaling: Optional[dict[str, Any]] = None,
-    dtype: Optional[torch.dtype] = None,
+    rope_scaling: dict[str, Any] | None = None,
+    dtype: torch.dtype | None = None,
     partial_rotary_factor: float = 1.0,
-    dual_chunk_attention_config: Optional[dict[str, Any]] = None,
+    dual_chunk_attention_config: dict[str, Any] | None = None,
 ) -> RotaryEmbedding:
     if dtype is None:
         dtype = torch.get_default_dtype()
diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py
index cf50b60118b9b..17cd39bb8cd63 100644
--- a/vllm/model_executor/layers/rotary_embedding/base.py
+++ b/vllm/model_executor/layers/rotary_embedding/base.py
@@ -2,8 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Rotary Positional Embeddings Base Class."""
 
-from typing import Optional
-
 import torch
 
 from vllm.model_executor.custom_op import CustomOp
@@ -92,8 +90,8 @@ class RotaryEmbedding(CustomOp):
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         """A PyTorch-native implementation of forward()."""
         positions = positions.flatten()
         num_tokens = positions.shape[0]
@@ -121,8 +119,8 @@ class RotaryEmbedding(CustomOp):
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         if self.use_flashinfer:
             torch.ops.vllm.flashinfer_rotary_embedding(
                 positions,
@@ -154,8 +152,8 @@ class RotaryEmbedding(CustomOp):
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         if self.is_rocm_triton_rotary_embedding_enabled:
             self._match_cos_sin_cache_dtype(query)
             rocm_aiter_rotary_emb(
@@ -177,8 +175,8 @@ class RotaryEmbedding(CustomOp):
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         from vllm._ipex_ops import ipex_ops as ops
 
         self._match_cos_sin_cache_dtype(query)
diff --git a/vllm/model_executor/layers/rotary_embedding/common.py b/vllm/model_executor/layers/rotary_embedding/common.py
index 124ea0236cbfb..f1b34f1785741 100644
--- a/vllm/model_executor/layers/rotary_embedding/common.py
+++ b/vllm/model_executor/layers/rotary_embedding/common.py
@@ -2,9 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
+from collections.abc import Callable
 from functools import cache
 from importlib.util import find_spec
-from typing import Callable, Optional
 
 import torch
 
@@ -72,7 +72,7 @@ def apply_rotary_emb_dispatch(
 
 @cache
 def dispatch_rotary_emb_function(
-    default: Optional[Callable[..., torch.Tensor]] = None,
+    default: Callable[..., torch.Tensor] | None = None,
 ) -> Callable[..., torch.Tensor]:
     if current_platform.is_cuda():
         return apply_rotary_emb
diff --git a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
index eaedca9b52192..2e5efec066634 100644
--- a/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
-from typing import Optional
 
 import torch
 
@@ -110,9 +109,9 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-        offsets: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+        offsets: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         """PyTorch-native implementation equivalent to forward()."""
         assert key is not None
         self._match_cos_sin_cache_dtype(query)
@@ -151,7 +150,7 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-        offsets: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+        offsets: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         return self.forward_native(positions, query, key, offsets)
diff --git a/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py b/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py
index 0e6eddda772f9..b5dd94cc7f531 100644
--- a/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -121,7 +120,7 @@ class DualChunkRotaryEmbedding(CustomOp):
         positions: torch.Tensor,
         query: torch.Tensor,
         key: torch.Tensor,
-        offsets: Optional[torch.Tensor] = None,
+        offsets: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         query = query.view(*query.shape[:-1], -1, self.head_size)
         key = key.view(*key.shape[:-1], -1, self.head_size)
@@ -185,7 +184,7 @@ class DualChunkRotaryEmbedding(CustomOp):
         positions: torch.Tensor,
         query: torch.Tensor,
         key: torch.Tensor,
-        offsets: Optional[torch.Tensor] = None,
+        offsets: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         return self.forward_native(positions, query, key, offsets)
 
diff --git a/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py b/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py
index 2bc0477c5af28..749cdbe88a62e 100644
--- a/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -16,8 +15,8 @@ class Ernie4_5_VLRotaryEmbedding(MRotaryEmbedding):
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         assert positions.ndim == 1 or positions.ndim == 2
         assert key is not None
 
@@ -71,6 +70,6 @@ class Ernie4_5_VLRotaryEmbedding(MRotaryEmbedding):
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         return self.forward_native(positions, query, key)
diff --git a/vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py
index cbb3ee4e9974b..bb51dcf1c6f50 100644
--- a/vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/linear_scaling_rope.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Union
 
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py
@@ -65,7 +64,7 @@ class LinearScalingRotaryEmbedding(RotaryEmbedding):
         max_position_embeddings: int,
         base: float,
         is_neox_style: bool,
-        scaling_factors: Union[list[float], float],
+        scaling_factors: list[float] | float,
         dtype: torch.dtype,
     ) -> None:
         if isinstance(scaling_factors, float):
diff --git a/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py b/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py
index 0b808e31c903e..efef8877bcaae 100644
--- a/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/llama4_vision_rope.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import math
-from typing import Optional
 
 import torch
 
@@ -56,8 +55,8 @@ class Llama4VisionRotaryEmbedding(RotaryEmbedding):
     def forward_native(  # type: ignore[override]
         self,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         assert key is not None
         # self.cos_sin_cache here is complex tensor so we cannot cast into
         # query's dtype directly with self._match_cos_sin_cache_dtype
@@ -76,6 +75,6 @@ class Llama4VisionRotaryEmbedding(RotaryEmbedding):
     def forward_cuda(  # type: ignore[override]
         self,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         return self.forward_native(query, key)
diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py
index fce110e6a5270..5cae3d9b80fa7 100644
--- a/vllm/model_executor/layers/rotary_embedding/mrope.py
+++ b/vllm/model_executor/layers/rotary_embedding/mrope.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
 
 import numpy as np
 import torch
@@ -212,11 +211,11 @@ class MRotaryEmbedding(RotaryEmbedding):
         base: float,
         is_neox_style: bool,
         dtype: torch.dtype,
-        mrope_section: Optional[list[int]] = None,
+        mrope_section: list[int] | None = None,
         mrope_interleaved: bool = False,
         # YaRN parameters.
         *,
-        scaling_factor: Optional[float] = None,
+        scaling_factor: float | None = None,
         extrapolation_factor: float = 1,
         attn_factor: float = 1,
         beta_fast: int = 32,
@@ -265,9 +264,9 @@ class MRotaryEmbedding(RotaryEmbedding):
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-        offsets: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+        offsets: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         """PyTorch-native implementation equivalent to forward().
 
         Args:
@@ -318,9 +317,9 @@ class MRotaryEmbedding(RotaryEmbedding):
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-        offsets: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+        offsets: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         assert positions.ndim == 1 or positions.ndim == 2
         assert key is not None
 
@@ -363,18 +362,18 @@ class MRotaryEmbedding(RotaryEmbedding):
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-        offsets: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+        offsets: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         return self.forward_native(positions, query, key, offsets)
 
     def forward_cpu(
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-        offsets: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+        offsets: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         return self.forward_native(positions, query, key, offsets)
 
     @classmethod
@@ -382,12 +381,12 @@ class MRotaryEmbedding(RotaryEmbedding):
         cls,
         input_tokens: list[int],
         hf_config: PretrainedConfig,
-        image_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
-        video_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
-        second_per_grid_ts: Optional[list[float]],
+        image_grid_thw: list[list[int]] | torch.Tensor | None,
+        video_grid_thw: list[list[int]] | torch.Tensor | None,
+        second_per_grid_ts: list[float] | None,
         context_len: int = 0,
-        seq_len: Optional[int] = None,
-        audio_feature_lengths: Optional[torch.Tensor] = None,
+        seq_len: int | None = None,
+        audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[list[list[int]], int]:
         """Get mrope input positions and delta value."""
diff --git a/vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py
index 560fb100413d1..031a12fceba65 100644
--- a/vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/ntk_scaling_rope.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 
@@ -21,7 +20,7 @@ class NTKScalingRotaryEmbedding(RotaryEmbedding):
         is_neox_style: bool,
         scaling_factor: float,
         dtype: torch.dtype,
-        mixed_b: Optional[float] = None,
+        mixed_b: float | None = None,
     ) -> None:
         self.scaling_factor = scaling_factor
         self.mixed_b = mixed_b
diff --git a/vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py b/vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py
index 02ad142d676b7..2a42e3bd00ec8 100644
--- a/vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -26,8 +25,8 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
         dtype: torch.dtype,
         short_factor: list[float],
         long_factor: list[float],
-        short_mscale: Optional[float] = None,
-        long_mscale: Optional[float] = None,
+        short_mscale: float | None = None,
+        long_mscale: float | None = None,
     ):
         super().__init__()
 
@@ -106,9 +105,9 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-        offsets: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        key: torch.Tensor | None = None,
+        offsets: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         assert key is not None
         query = query.view(*query.shape[:-1], -1, self.head_size)
         key = key.view(*key.shape[:-1], -1, self.head_size)
diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py
index e522cc450d6bd..87ffcb48c8c02 100644
--- a/vllm/model_executor/layers/utils.py
+++ b/vllm/model_executor/layers/utils.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Utility methods for model layers."""
 
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 
@@ -95,13 +95,13 @@ def default_unquantized_gemm(
     layer: torch.nn.Module,
     x: torch.Tensor,
     weight: torch.Tensor,
-    bias: Optional[torch.Tensor] = None,
+    bias: torch.Tensor | None = None,
 ):
     return torch.nn.functional.linear(x, weight, bias)
 
 
 def rocm_unquantized_gemm_impl(
-    x: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None
+    x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor | None = None
 ) -> torch.Tensor:
     from vllm.platforms.rocm import on_gfx9
 
@@ -131,7 +131,7 @@ def rocm_unquantized_gemm_impl(
 
 
 def rocm_unquantized_gemm_impl_fake(
-    x: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None
+    x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor | None = None
 ) -> torch.Tensor:
     return x.new_empty((*x.shape[:-1], weight.shape[0]))
 
@@ -140,7 +140,7 @@ def rocm_unquantized_gemm(
     layer: torch.nn.Module,
     x: torch.Tensor,
     weight: torch.Tensor,
-    bias: Optional[torch.Tensor] = None,
+    bias: torch.Tensor | None = None,
 ) -> torch.Tensor:
     return torch.ops.vllm.rocm_unquantized_gemm_impl(x, weight, bias)
 
@@ -197,7 +197,7 @@ def cpu_unquantized_gemm(
     layer: torch.nn.Module,
     x: torch.Tensor,
     weight: torch.Tensor,
-    bias: Optional[torch.Tensor] = None,
+    bias: torch.Tensor | None = None,
 ):
     return layer.cpu_linear(x, weight, bias)
 
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index b7253c7f0e523..1abc3ad884550 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -3,7 +3,6 @@
 
 from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 import torch.nn.functional as F
@@ -65,7 +64,7 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
         self,
         layer: torch.nn.Module,
         x: torch.Tensor,
-        bias: Optional[torch.Tensor] = None,
+        bias: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return dispatch_unquantized_gemm()(layer, x, layer.weight, bias)
 
@@ -226,10 +225,10 @@ class VocabParallelEmbedding(CustomOp):
         self,
         num_embeddings: int,
         embedding_dim: int,
-        params_dtype: Optional[torch.dtype] = None,
-        org_num_embeddings: Optional[int] = None,
+        params_dtype: torch.dtype | None = None,
+        org_num_embeddings: int | None = None,
         padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -347,7 +346,7 @@ class VocabParallelEmbedding(CustomOp):
             added_vocab_end_index,
         )
 
-    def get_sharded_to_full_mapping(self) -> Optional[list[int]]:
+    def get_sharded_to_full_mapping(self) -> list[int] | None:
         """Get a mapping that can be used to reindex the gathered
         logits for sampling.
 
@@ -515,10 +514,10 @@ class ParallelLMHead(VocabParallelEmbedding):
         num_embeddings: int,
         embedding_dim: int,
         bias: bool = False,
-        params_dtype: Optional[torch.dtype] = None,
-        org_num_embeddings: Optional[int] = None,
+        params_dtype: torch.dtype | None = None,
+        org_num_embeddings: int | None = None,
         padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__(
diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py
index df0d059594a76..301f2d00bf404 100644
--- a/vllm/model_executor/model_loader/__init__.py
+++ b/vllm/model_executor/model_loader/__init__.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Literal, Optional
+from typing import Literal
 
 from torch import nn
 
@@ -122,7 +122,7 @@ def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
 
 
 def get_model(
-    *, vllm_config: VllmConfig, model_config: Optional[ModelConfig] = None
+    *, vllm_config: VllmConfig, model_config: ModelConfig | None = None
 ) -> nn.Module:
     loader = get_model_loader(vllm_config.load_config)
     if model_config is None:
diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py
index d41b8ae55ea5f..71df96cb3e9a4 100644
--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -6,8 +6,8 @@ import glob
 import itertools
 import math
 import os
-from collections.abc import Generator
-from typing import Any, Callable, Optional
+from collections.abc import Callable, Generator
+from typing import Any
 
 import numpy as np
 import torch
@@ -88,7 +88,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
         self,
         model_name_or_path: str,
         allowed_patterns: list[str],
-        revision: Optional[str] = None,
+        revision: str | None = None,
     ) -> tuple[str, list[str], str]:
         """Retrieve weight files. Download the files if necessary.
 
@@ -122,7 +122,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
         raise RuntimeError(f"No model weights found in: `{model_name_or_path}`")
 
     def _prepare_weights(
-        self, model_name_or_path: str, revision: Optional[str]
+        self, model_name_or_path: str, revision: str | None
     ) -> tuple[list[str], bool]:
         """Prepare weight files for the model."""
 
@@ -196,7 +196,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
     def _get_quantized_weights_iterator(
         self,
         model_name_or_path: str,
-        revision: Optional[str],
+        revision: str | None,
     ) -> tuple[Generator[tuple[str, torch.Tensor], None, None], dict[str, Any]]:
         """Get an iterator to the model weights with bitsandbytes quantization,
         as well as the quantization state dictionary."""
diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py
index 00944989a002f..c97de1aa45964 100644
--- a/vllm/model_executor/model_loader/default_loader.py
+++ b/vllm/model_executor/model_loader/default_loader.py
@@ -5,7 +5,7 @@ import glob
 import os
 import time
 from collections.abc import Generator, Iterable
-from typing import Optional, cast
+from typing import cast
 
 import torch
 from torch import nn
@@ -47,7 +47,7 @@ class DefaultModelLoader(BaseModelLoader):
         model_or_path: str
         """The model ID or path."""
 
-        revision: Optional[str]
+        revision: str | None
         """The optional model revision."""
 
         prefix: str = ""
@@ -56,7 +56,7 @@ class DefaultModelLoader(BaseModelLoader):
         fall_back_to_pt: bool = True
         """Whether .pt weights can be used."""
 
-        allow_patterns_overrides: Optional[list[str]] = None
+        allow_patterns_overrides: list[str] | None = None
         """If defined, weights will load exclusively using these patterns."""
 
     counter_before_loading_weights: float = 0.0
@@ -79,9 +79,9 @@ class DefaultModelLoader(BaseModelLoader):
     def _prepare_weights(
         self,
         model_name_or_path: str,
-        revision: Optional[str],
+        revision: str | None,
         fall_back_to_pt: bool,
-        allow_patterns_overrides: Optional[list[str]],
+        allow_patterns_overrides: list[str] | None,
     ) -> tuple[str, list[str], bool]:
         """Prepare weights for the model.
 
diff --git a/vllm/model_executor/model_loader/runai_streamer_loader.py b/vllm/model_executor/model_loader/runai_streamer_loader.py
index 50a92edd1162c..079e3168647bb 100644
--- a/vllm/model_executor/model_loader/runai_streamer_loader.py
+++ b/vllm/model_executor/model_loader/runai_streamer_loader.py
@@ -3,7 +3,6 @@
 # ruff: noqa: SIM117
 import os
 from collections.abc import Generator
-from typing import Optional
 
 import torch
 from torch import nn
@@ -51,7 +50,7 @@ class RunaiModelStreamerLoader(BaseModelLoader):
                 os.environ["RUNAI_STREAMER_S3_ENDPOINT"] = aws_endpoint_url
 
     def _prepare_weights(
-        self, model_name_or_path: str, revision: Optional[str]
+        self, model_name_or_path: str, revision: str | None
     ) -> list[str]:
         """Prepare weights for the model.
 
diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py
index e65eb78819e29..d94dbd9f06e0b 100644
--- a/vllm/model_executor/model_loader/sharded_state_loader.py
+++ b/vllm/model_executor/model_loader/sharded_state_loader.py
@@ -5,7 +5,7 @@ import collections
 import glob
 import os
 from collections.abc import Generator
-from typing import Any, Optional
+from typing import Any
 
 import torch
 from torch import nn
@@ -89,7 +89,7 @@ class ShardedStateLoader(BaseModelLoader):
                     result[k] = t
         return result
 
-    def _prepare_weights(self, model_name_or_path: str, revision: Optional[str]):
+    def _prepare_weights(self, model_name_or_path: str, revision: str | None):
         if is_s3(model_name_or_path) or os.path.isdir(model_name_or_path):
             return model_name_or_path
         else:
@@ -171,8 +171,8 @@ class ShardedStateLoader(BaseModelLoader):
     def save_model(
         model: torch.nn.Module,
         path: str,
-        pattern: Optional[str] = None,
-        max_size: Optional[int] = None,
+        pattern: str | None = None,
+        max_size: int | None = None,
     ) -> None:
         from safetensors.torch import save_file
 
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index 9d58278f996b6..ce5c0506979a2 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -12,7 +12,7 @@ import threading
 import time
 from collections.abc import Generator, MutableMapping
 from dataclasses import asdict, dataclass, field, fields
-from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union
+from typing import TYPE_CHECKING, Any, ClassVar, Optional
 
 import regex as re
 import torch
@@ -67,7 +67,7 @@ __all__ = [
 logger = init_logger(__name__)
 
 
-def is_valid_deserialization_uri(uri: Optional[str]) -> bool:
+def is_valid_deserialization_uri(uri: str | None) -> bool:
     if uri:
         scheme = uri.lower().split("://")[0]
         return scheme in {"s3", "http", "https"} or os.path.exists(uri)
@@ -156,25 +156,23 @@ class _NoInitOrTensorImpl:
 
 @dataclass
 class TensorizerConfig(MutableMapping):
-    tensorizer_uri: Optional[str] = None
-    tensorizer_dir: Optional[str] = None
-    vllm_tensorized: Optional[bool] = None
-    verify_hash: Optional[bool] = None
-    num_readers: Optional[int] = None
-    encryption_keyfile: Optional[str] = None
-    s3_access_key_id: Optional[str] = None
-    s3_secret_access_key: Optional[str] = None
-    s3_endpoint: Optional[str] = None
-    lora_dir: Optional[str] = None
-    stream_kwargs: Optional[dict[str, Any]] = None
-    serialization_kwargs: Optional[dict[str, Any]] = None
-    deserialization_kwargs: Optional[dict[str, Any]] = None
-    _extra_serialization_attrs: Optional[dict[str, Any]] = field(
-        init=False, default=None
-    )
-    model_class: Optional[type[torch.nn.Module]] = field(init=False, default=None)
-    hf_config: Optional[PretrainedConfig] = field(init=False, default=None)
-    dtype: Optional[Union[str, torch.dtype]] = field(init=False, default=None)
+    tensorizer_uri: str | None = None
+    tensorizer_dir: str | None = None
+    vllm_tensorized: bool | None = None
+    verify_hash: bool | None = None
+    num_readers: int | None = None
+    encryption_keyfile: str | None = None
+    s3_access_key_id: str | None = None
+    s3_secret_access_key: str | None = None
+    s3_endpoint: str | None = None
+    lora_dir: str | None = None
+    stream_kwargs: dict[str, Any] | None = None
+    serialization_kwargs: dict[str, Any] | None = None
+    deserialization_kwargs: dict[str, Any] | None = None
+    _extra_serialization_attrs: dict[str, Any] | None = field(init=False, default=None)
+    model_class: type[torch.nn.Module] | None = field(init=False, default=None)
+    hf_config: PretrainedConfig | None = field(init=False, default=None)
+    dtype: str | torch.dtype | None = field(init=False, default=None)
     _is_sharded: bool = field(init=False, default=False)
     _fields: ClassVar[tuple[str, ...]]
     _keys: ClassVar[frozenset[str]]
@@ -362,9 +360,9 @@ TensorizerConfig._keys = frozenset(TensorizerConfig._fields)
 
 @dataclass
 class TensorizerArgs:
-    tensorizer_uri: Optional[str] = None
-    tensorizer_dir: Optional[str] = None
-    encryption_keyfile: Optional[str] = None
+    tensorizer_uri: str | None = None
+    tensorizer_dir: str | None = None
+    encryption_keyfile: str | None = None
 
     def __init__(self, tensorizer_config: TensorizerConfig):
         for k, v in tensorizer_config.items():
@@ -621,7 +619,7 @@ def is_vllm_tensorized(tensorizer_config: "TensorizerConfig") -> bool:
 
 
 def serialize_extra_artifacts(
-    tensorizer_args: TensorizerArgs, served_model_name: Union[str, list[str], None]
+    tensorizer_args: TensorizerArgs, served_model_name: str | list[str] | None
 ) -> None:
     if not isinstance(served_model_name, str):
         raise ValueError(
diff --git a/vllm/model_executor/model_loader/tensorizer_loader.py b/vllm/model_executor/model_loader/tensorizer_loader.py
index 5585a74f8926e..ba72d576babc4 100644
--- a/vllm/model_executor/model_loader/tensorizer_loader.py
+++ b/vllm/model_executor/model_loader/tensorizer_loader.py
@@ -3,7 +3,6 @@
 # ruff: noqa: SIM117
 import copy
 from collections.abc import Generator
-from typing import Union
 
 import torch
 from torch import nn
@@ -140,7 +139,7 @@ class TensorizerLoader(BaseModelLoader):
     @staticmethod
     def save_model(
         model: torch.nn.Module,
-        tensorizer_config: Union[TensorizerConfig, dict],
+        tensorizer_config: TensorizerConfig | dict,
         model_config: ModelConfig,
     ) -> None:
         if isinstance(tensorizer_config, dict):
diff --git a/vllm/model_executor/model_loader/tpu.py b/vllm/model_executor/model_loader/tpu.py
index fc97003de8e3c..ec42e3a1ea26b 100644
--- a/vllm/model_executor/model_loader/tpu.py
+++ b/vllm/model_executor/model_loader/tpu.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import time
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -30,7 +29,7 @@ class TPUModelLoader(DefaultModelLoader):
         self,
         vllm_config: VllmConfig,
         model_config: ModelConfig,
-        mesh: Optional[xs.Mesh] = None,
+        mesh: xs.Mesh | None = None,
     ) -> nn.Module:
         # Initialize model and load weights on CPU. Then, during SPMD partition,
         # weights are sharded and transferred to TPUs.
@@ -90,7 +89,7 @@ class TPUModelLoader(DefaultModelLoader):
             )
         return model
 
-    def _check_model_is_loaded(self, mesh: Optional[xs.Mesh], model: nn.Module) -> None:
+    def _check_model_is_loaded(self, mesh: xs.Mesh | None, model: nn.Module) -> None:
         """
         Ensure the model is properly loaded.
         1. All model parameters and buffers are on XLA device.
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index 5ae32f1d120c0..c68ac611558a4 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -7,7 +7,6 @@ import inspect
 import warnings
 from contextlib import contextmanager
 from dataclasses import dataclass, field
-from typing import Optional
 
 import torch
 from torch import nn
@@ -46,8 +45,8 @@ def initialize_model(
     vllm_config: VllmConfig,
     *,
     prefix: str = "",
-    model_class: Optional[type[nn.Module]] = None,
-    model_config: Optional[ModelConfig] = None,
+    model_class: type[nn.Module] | None = None,
+    model_config: ModelConfig | None = None,
 ) -> nn.Module:
     """Initialize a model with the given configurations."""
     if model_config is None:
@@ -268,7 +267,7 @@ class ParamMapping:
                     index,
                 )
 
-    def get_sub_modules(self, module_name: str) -> Optional[tuple[str, list[str]]]:
+    def get_sub_modules(self, module_name: str) -> tuple[str, list[str]] | None:
         for key, value in self.packed_mapping.items():
             if module_name.endswith(key):
                 return key, value
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 5f83482bec3a0..c2d68029f4c71 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -11,10 +11,10 @@ import os
 import tempfile
 import time
 from collections import defaultdict
-from collections.abc import Generator
+from collections.abc import Callable, Generator
 from contextlib import contextmanager
 from pathlib import Path
-from typing import IO, Any, Callable, Optional, Union
+from typing import IO, Any
 
 import filelock
 import huggingface_hub.constants
@@ -85,7 +85,7 @@ class DisabledTqdm(tqdm):
         super().__init__(*args, **kwargs, disable=True)
 
 
-def get_lock(model_name_or_path: Union[str, Path], cache_dir: Optional[str] = None):
+def get_lock(model_name_or_path: str | Path, cache_dir: str | None = None):
     lock_dir = cache_dir or temp_dir
     model_name_or_path = str(model_name_or_path)
     os.makedirs(os.path.dirname(lock_dir), exist_ok=True)
@@ -100,7 +100,7 @@ def get_lock(model_name_or_path: Union[str, Path], cache_dir: Optional[str] = No
 
 @contextmanager
 def atomic_writer(
-    filepath: Union[str, Path], mode: str = "w", encoding: Optional[str] = None
+    filepath: str | Path, mode: str = "w", encoding: str | None = None
 ) -> Generator[IO]:
     """
     Context manager that provides an atomic file writing routine.
@@ -143,11 +143,11 @@ def atomic_writer(
 
 def maybe_download_from_modelscope(
     model: str,
-    revision: Optional[str] = None,
-    download_dir: Optional[str] = None,
-    ignore_patterns: Optional[Union[str, list[str]]] = None,
-    allow_patterns: Optional[Union[list[str], str]] = None,
-) -> Optional[str]:
+    revision: str | None = None,
+    download_dir: str | None = None,
+    ignore_patterns: str | list[str] | None = None,
+    allow_patterns: list[str] | str | None = None,
+) -> str | None:
     """Download model from ModelScope hub if VLLM_USE_MODELSCOPE is True.
 
     Returns the path to the downloaded model, or None if the model is not
@@ -370,10 +370,10 @@ def get_sparse_attention_config(
 
 def download_weights_from_hf(
     model_name_or_path: str,
-    cache_dir: Optional[str],
+    cache_dir: str | None,
     allow_patterns: list[str],
-    revision: Optional[str] = None,
-    ignore_patterns: Optional[Union[str, list[str]]] = None,
+    revision: str | None = None,
+    ignore_patterns: str | list[str] | None = None,
 ) -> str:
     """Download model weights from Hugging Face Hub.
 
@@ -448,8 +448,8 @@ def download_weights_from_hf(
 def download_safetensors_index_file_from_hf(
     model_name_or_path: str,
     index_file: str,
-    cache_dir: Optional[str],
-    revision: Optional[str] = None,
+    cache_dir: str | None,
+    revision: str | None = None,
 ) -> None:
     """Download hf safetensors index file from Hugging Face Hub.
 
@@ -540,7 +540,7 @@ def enable_tqdm(use_tqdm_on_load: bool):
 
 def np_cache_weights_iterator(
     model_name_or_path: str,
-    cache_dir: Optional[str],
+    cache_dir: str | None,
     hf_folder: str,
     hf_weights_files: list[str],
     use_tqdm_on_load: bool,
@@ -746,7 +746,7 @@ def fastsafetensors_weights_iterator(
 def pt_weights_iterator(
     hf_weights_files: list[str],
     use_tqdm_on_load: bool,
-    pt_load_map_location: Union[str, dict[str, str]] = "cpu",
+    pt_load_map_location: str | dict[str, str] = "cpu",
 ) -> Generator[tuple[str, torch.Tensor], None, None]:
     """Iterate over the weights in the model bin/pt files."""
     for bin_file in tqdm(
@@ -765,7 +765,7 @@ def pt_weights_iterator(
 def multi_thread_pt_weights_iterator(
     hf_weights_files: list[str],
     use_tqdm_on_load: bool,
-    pt_load_map_location: Union[str, dict[str, str]] = "cpu",
+    pt_load_map_location: str | dict[str, str] = "cpu",
     max_workers: int = 4,
 ) -> Generator[tuple[str, torch.Tensor], None, None]:
     """Multi-Thread iterate over the weights in the model bin/pt files."""
@@ -985,7 +985,7 @@ def initialize_dummy_weights(
                 param.uniform_(low, high, generator=generator)
 
 
-def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:
+def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> str | None:
     """Remap the name of FP8 k/v_scale parameters.
 
     This function handles the remapping of FP8 k/v_scale parameter names.
diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py
index fd8a0b87e43ec..32073cb88de40 100644
--- a/vllm/model_executor/models/adapters.py
+++ b/vllm/model_executor/models/adapters.py
@@ -4,7 +4,7 @@
 import ast
 import inspect
 from collections.abc import Iterable
-from typing import TYPE_CHECKING, Any, Optional, TypeVar, cast
+from typing import TYPE_CHECKING, Any, TypeVar, cast
 
 import torch
 import torch.nn as nn
@@ -32,7 +32,7 @@ _GENERATE_SUFFIXES = [
 ]
 
 
-def _load_st_projector(model_config: "ModelConfig") -> Optional[nn.Module]:
+def _load_st_projector(model_config: "ModelConfig") -> nn.Module | None:
     """Load Sentence-Transformers Dense projection layers."""
 
     try:
@@ -357,8 +357,8 @@ def as_seq_cls_model(cls: _T) -> _T:
             self,
             input_ids: torch.Tensor,
             positions: torch.Tensor,
-            intermediate_tensors: Optional[IntermediateTensors] = None,
-            inputs_embeds: Optional[torch.Tensor] = None,
+            intermediate_tensors: IntermediateTensors | None = None,
+            inputs_embeds: torch.Tensor | None = None,
         ) -> torch.Tensor:
             return super().forward(
                 input_ids, positions, intermediate_tensors, inputs_embeds
diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py
index 2423ad5b0c3ad..5872e8196eada 100644
--- a/vllm/model_executor/models/aimv2.py
+++ b/vllm/model_executor/models/aimv2.py
@@ -4,7 +4,6 @@
 # A modified implementation of the AIMv2 Transformer
 # inserted here also the image tokenizer used by Ovis2
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -165,7 +164,7 @@ class AIMv2Transformer(nn.Module):
         config: AIMv2Config,
         quant_config: QuantizationConfig,
         *,
-        require_post_norm: Optional[bool] = None,
+        require_post_norm: bool | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -196,7 +195,7 @@ class AIMv2Model(torch.nn.Module):
         config: AIMv2Config,
         quant_config: QuantizationConfig,
         *,
-        require_post_norm: Optional[bool] = None,
+        require_post_norm: bool | None = None,
         prefix: str = "",
     ):
         super().__init__()
diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py
index c5d3d49d67602..72e5ddcf1abeb 100644
--- a/vllm/model_executor/models/apertus.py
+++ b/vllm/model_executor/models/apertus.py
@@ -27,7 +27,7 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -77,7 +77,7 @@ class ApertusMLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         prefix: str = "",
         reduce_results: bool = True,
@@ -120,12 +120,12 @@ class ApertusAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         bias_o_proj: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
         attn_type: str = AttentionType.DECODER,
     ) -> None:
@@ -225,8 +225,8 @@ class ApertusAttention(nn.Module):
     def _init_rotary_emb(
         self,
         config: ApertusConfig,
-        rope_scaling: Optional[dict[str, Any]],
-        quant_config: Optional[QuantizationConfig],
+        rope_scaling: dict[str, Any] | None,
+        quant_config: QuantizationConfig | None,
     ) -> None:
         is_neox_style = True
         is_gguf = quant_config and quant_config.get_name() == "gguf"
@@ -248,8 +248,8 @@ class ApertusDecoderLayer(nn.Module):
     def __init__(
         self,
         config: ApertusConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -316,7 +316,7 @@ class ApertusDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
@@ -394,13 +394,11 @@ class ApertusModel(nn.Module):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[
-        torch.Tensor, IntermediateTensors, tuple[torch.Tensor, list[torch.Tensor]]
-    ]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors | tuple[torch.Tensor, list[torch.Tensor]]:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -586,9 +584,9 @@ class ApertusForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         model_output = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -597,7 +595,7 @@ class ApertusForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py
index 634e94b168143..08bf1a6aad75b 100644
--- a/vllm/model_executor/models/arcee.py
+++ b/vllm/model_executor/models/arcee.py
@@ -10,7 +10,7 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -52,7 +52,7 @@ class ArceeMLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[Any] = None,
+        quant_config: Any | None = None,
         bias: bool = False,
         prefix: str = "",
         reduce_results: bool = True,
@@ -98,8 +98,8 @@ class ArceeDecoderLayer(nn.Module):
     def __init__(
         self,
         config: LlamaConfig,
-        cache_config: Optional[Any] = None,
-        quant_config: Optional[Any] = None,
+        cache_config: Any | None = None,
+        quant_config: Any | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -165,7 +165,7 @@ class ArceeDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self-Attention block
         if residual is None:
@@ -247,13 +247,11 @@ class ArceeModel(nn.Module):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[
-        torch.Tensor, IntermediateTensors, tuple[torch.Tensor, list[torch.Tensor]]
-    ]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors | tuple[torch.Tensor, list[torch.Tensor]]:
         # Embedding lookup (on first pipeline rank)
         if get_pp_group().is_first_rank:
             hidden_states = (
@@ -415,9 +413,9 @@ class ArceeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         model_output = self.model(
             input_ids=input_ids,
             positions=positions,
@@ -426,7 +424,7 @@ class ArceeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         )
         return model_output
 
-    def compute_logits(self, hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
         # Compute final logits from hidden states (last pipeline rank only)
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py
index 760df1cef82b0..e0b6444c91836 100644
--- a/vllm/model_executor/models/arctic.py
+++ b/vllm/model_executor/models/arctic.py
@@ -4,7 +4,6 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -63,7 +62,7 @@ class ArcticMLP(nn.Module):
         config: ArcticConfig,
         expert_id: int = -1,
         is_residual_mlp: bool = False,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         reduce_results: bool = True,
         prefix: str = "",
     ):
@@ -107,9 +106,9 @@ class ArcticMoE(nn.Module):
     def __init__(
         self,
         config: ArcticConfig,
-        tp_size: Optional[int] = None,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        tp_size: int | None = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
         reduce_results: bool = True,
         prefix: str = "",
     ):
@@ -265,8 +264,8 @@ class ArcticAttention(nn.Module):
     def __init__(
         self,
         config: ArcticConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -342,8 +341,8 @@ class ArcticDecoderLayer(nn.Module):
     def __init__(
         self,
         config: ArcticConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -443,9 +442,9 @@ class ArcticModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -499,9 +498,9 @@ class ArcticForCausalLM(nn.Module, SupportsPP, SupportsQuant):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -510,7 +509,7 @@ class ArcticForCausalLM(nn.Module, SupportsPP, SupportsQuant):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 734ae8cbd6087..222a425790543 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal
 
 import torch
 import torch.nn as nn
@@ -71,7 +71,7 @@ class AriaImagePixelInputs(TensorSchema):
     ]
 
     pixel_mask: Annotated[
-        Optional[torch.Tensor],
+        torch.Tensor | None,
         TensorShape("bn", "h", "w"),
     ]
 
@@ -82,7 +82,7 @@ class AriaVisionTransformer(Idefics3VisionTransformer, SupportsQuant):
     def __init__(
         self,
         config: Idefics2VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__(config, quant_config=quant_config, prefix=prefix)
@@ -180,7 +180,7 @@ class AriaProjector(nn.Module):
     def forward(
         self,
         x: torch.Tensor,
-        attn_mask: Optional[torch.Tensor] = None,
+        attn_mask: torch.Tensor | None = None,
     ) -> torch.Tensor:
         batch_size, num_patches = x.shape[0], x.shape[1]
 
@@ -250,7 +250,7 @@ class AriaTextMoELayer(nn.Module):
     def __init__(
         self,
         config: AriaTextConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -415,7 +415,7 @@ class AriaProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self, **kwargs: object):
         return self.ctx.get_hf_processor(AriaProcessor, **kwargs)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_num_image_tokens(self) -> int:
@@ -436,7 +436,7 @@ class AriaDummyInputsBuilder(BaseDummyInputsBuilder[AriaProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         vision_config = self.info.get_vision_config()
 
@@ -517,7 +517,7 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
     )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|fim_prefix|><|img|><|fim_suffix|>"
 
@@ -562,7 +562,7 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[AriaImagePixelInputs]:
+    ) -> AriaImagePixelInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         pixel_mask = kwargs.pop("pixel_mask", None)
 
@@ -577,8 +577,8 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
 
     def _create_patch_attention_mask(
         self,
-        pixel_mask: Optional[torch.Tensor],
-    ) -> Optional[torch.Tensor]:
+        pixel_mask: torch.Tensor | None,
+    ) -> torch.Tensor | None:
         if pixel_mask is None:
             return None
 
@@ -628,10 +628,10 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if inputs_embeds is None:
             multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
             inputs_embeds = self.get_input_embeddings(
diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py
index 6e93de524e482..839ab5947e094 100644
--- a/vllm/model_executor/models/aya_vision.py
+++ b/vllm/model_executor/models/aya_vision.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from https://github.com/huggingface/transformers/tree/main/src/transformers/models/aya_vision
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal
 
 import torch
 from torch import nn
@@ -139,7 +139,7 @@ class AyaVisionProcessingInfo(BaseProcessingInfo):
     def get_image_processor(self, **kwargs: object) -> GotOcr2ImageProcessor:
         return self.get_hf_processor(**kwargs).image_processor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_image_size_with_most_features(self) -> ImageSize:
@@ -187,7 +187,7 @@ class AyaVisionDummyInputsBuilder(BaseDummyInputsBuilder[AyaVisionProcessingInfo
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         image_size = self.info.get_image_size_with_most_features()
@@ -331,7 +331,7 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
     )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
 
@@ -375,7 +375,7 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
         self,
         vision_tower: SiglipVisionModel,
         pixel_values: torch.Tensor,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         return vision_tower(
             pixel_values.to(dtype=vision_tower.dtype),
             feature_select_strategy=self.config.vision_feature_select_strategy,
@@ -395,7 +395,7 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[AyaVisionImagePixelInputs]:
+    ) -> AyaVisionImagePixelInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         num_patches = kwargs.pop("num_patches", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -428,10 +428,10 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
@@ -446,5 +446,5 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index a8f0e5993e2bc..ccf32c9ee1ac7 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -24,7 +24,6 @@
 import math
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -98,7 +97,7 @@ class BaiChuanMLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
@@ -130,8 +129,8 @@ class BaiChuanAttention(nn.Module):
         position_embedding: str,
         rope_theta: float = 10000,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -213,8 +212,8 @@ class BaiChuanDecoderLayer(nn.Module):
         self,
         config: PretrainedConfig,
         position_embedding: str,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -246,7 +245,7 @@ class BaiChuanDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
@@ -305,9 +304,9 @@ class BaiChuanModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -422,9 +421,9 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -433,7 +432,7 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py
index c016d46e194f2..a7f3ebed644fc 100644
--- a/vllm/model_executor/models/bailing_moe.py
+++ b/vllm/model_executor/models/bailing_moe.py
@@ -26,7 +26,6 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 import torch.nn.functional as F
@@ -75,8 +74,8 @@ class BailingAttention(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         reduce_results: bool = True,
         prefix: str = "",
     ):
@@ -184,8 +183,8 @@ class BailingMLP(nn.Module):
         self,
         intermediate_size: int,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-        reduce_results: Optional[bool] = True,
+        quant_config: QuantizationConfig | None = None,
+        reduce_results: bool | None = True,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -218,8 +217,8 @@ class BailingMoE(nn.Module):
         self,
         intermediate_size: int,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-        reduce_results: Optional[bool] = True,
+        quant_config: QuantizationConfig | None = None,
+        reduce_results: bool | None = True,
         prefix: str = "",
     ):
         super().__init__()
@@ -340,8 +339,8 @@ class BailingMoeBlock(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -370,7 +369,7 @@ class BailingMoeBlock(nn.Module):
         self,
         hidden_states: torch.Tensor,
         position_ids: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
         if residual is None:
             residual = hidden_states
@@ -447,9 +446,9 @@ class BailingMoeModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -619,9 +618,9 @@ class BailingMoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         model_output = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -630,7 +629,7 @@ class BailingMoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py
index 42c1c7be1a75a..1a06f0659235e 100644
--- a/vllm/model_executor/models/bamba.py
+++ b/vllm/model_executor/models/bamba.py
@@ -4,7 +4,6 @@
 
 # Added by the IBM Team, 2024
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 from torch import nn
@@ -52,7 +51,7 @@ class BambaMLP(nn.Module):
     def __init__(
         self,
         config: BambaConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
     ) -> None:
         super().__init__()
@@ -87,9 +86,9 @@ class BambaMixerDecoderLayer(nn.Module):
         self,
         config: BambaConfig,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -119,7 +118,7 @@ class BambaMixerDecoderLayer(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         if residual is None:
@@ -141,9 +140,9 @@ class BambaAttentionDecoderLayer(nn.Module):
         self,
         config: BambaConfig,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -235,7 +234,7 @@ class BambaAttentionDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         if residual is None:
@@ -314,8 +313,8 @@ class BambaModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
@@ -497,8 +496,8 @@ class BambaForCausalLM(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
     ):
         hidden_states = self.model(
@@ -510,7 +509,7 @@ class BambaForCausalLM(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index d9d4c62639d50..e07da3d4d29ae 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable, Set
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -66,7 +65,7 @@ class BertEmbedding(nn.Module):
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         token_type_ids = _decode_token_type_ids(input_ids)
 
@@ -103,9 +102,9 @@ class BertPooler(Pooler):
 
     def forward(
         self,
-        hidden_states: Union[torch.Tensor, list[torch.Tensor]],
+        hidden_states: torch.Tensor | list[torch.Tensor],
         pooling_metadata: PoolingMetadata,
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+    ) -> torch.Tensor | list[torch.Tensor]:
         pooled_output = self.pooling(hidden_states, pooling_metadata)
 
         if isinstance(pooled_output, list):
@@ -147,8 +146,8 @@ class BertLayer(nn.Module):
     def __init__(
         self,
         config: BertConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -191,8 +190,8 @@ class BertAttention(nn.Module):
         hidden_size: int,
         num_attention_heads: int,
         layer_norm_eps: float,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -225,8 +224,8 @@ class BertSelfAttention(nn.Module):
         self,
         hidden_size: int,
         num_attention_heads: int,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -281,7 +280,7 @@ class BertSelfOutput(nn.Module):
         self,
         hidden_size: int,
         layer_norm_eps: float,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -308,7 +307,7 @@ class BertIntermediate(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -333,7 +332,7 @@ class BertOutput(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         layer_norm_eps: float,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -383,8 +382,8 @@ class BertModel(nn.Module, SupportsQuant):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         hidden_states = self.embeddings(
             input_ids=input_ids,
@@ -494,8 +493,8 @@ class BertEmbeddingModel(nn.Module, SupportsQuant):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return self.model(
             input_ids=input_ids,
@@ -636,11 +635,11 @@ class BertForSequenceClassification(nn.Module, SupportsCrossEncoding, SupportsQu
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        token_type_ids: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        token_type_ids: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if token_type_ids is not None:
             assert self.bert.config.vocab_size < (1 << TOKEN_TYPE_SHIFT)
@@ -692,11 +691,11 @@ class BertForTokenClassification(nn.Module):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        token_type_ids: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        token_type_ids: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if token_type_ids is not None:
             assert self.bert.config.vocab_size < (1 << TOKEN_TYPE_SHIFT)
diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py
index 05cb0e22a0aad..49111dd9ffab5 100644
--- a/vllm/model_executor/models/bert_with_rope.py
+++ b/vllm/model_executor/models/bert_with_rope.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 from torch import nn
@@ -67,7 +66,7 @@ class BertWithRopeEmbedding(nn.Module):
     def forward(
         self,
         input_ids: torch.Tensor,
-        token_type_ids: Optional[torch.Tensor] = None,
+        token_type_ids: torch.Tensor | None = None,
     ) -> torch.Tensor:
         input_shape = input_ids.size()
         inputs_embeds = self.word_embeddings(input_ids)
@@ -91,10 +90,10 @@ class BertWithRopeAttention(nn.Module):
         self,
         hidden_size: int,
         num_attention_heads: int,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = True,
-        rotary_kwargs: Optional[dict] = None,
+        rotary_kwargs: dict | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -166,7 +165,7 @@ class BertWithRopeGatedMLP(nn.Module):
         intermediate_size: int,
         hidden_act: str,
         bias: bool = True,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -200,7 +199,7 @@ class BertWithRopeMLP(nn.Module):
         intermediate_size: int,
         hidden_act: str,
         bias: bool = True,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -235,8 +234,8 @@ class NomicMoE(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        params_dtype: Optional[torch.dtype] = None,
-        tp_size: Optional[int] = None,
+        params_dtype: torch.dtype | None = None,
+        tp_size: int | None = None,
     ):
         super().__init__()
 
@@ -344,11 +343,11 @@ class BertWithRopeBlock(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         moe: bool = False,
         bias: bool = True,
-        rotary_kwargs: Optional[dict] = None,
+        rotary_kwargs: dict | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -406,7 +405,7 @@ class BertWithRopeEncoder(nn.Module):
         self,
         vllm_config: VllmConfig,
         bias: bool = True,
-        rotary_kwargs: Optional[dict] = None,
+        rotary_kwargs: dict | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -471,9 +470,9 @@ class BertWithRope(nn.Module, SupportsQuant):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        token_type_ids: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        token_type_ids: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if inputs_embeds is not None:
             hidden_states = inputs_embeds
@@ -724,10 +723,10 @@ class GteNewForSequenceClassification(nn.Module, SupportsCrossEncoding):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return self.new(
             input_ids=input_ids,
diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py
index aa361e0a2a398..2e4f73312efa3 100644
--- a/vllm/model_executor/models/blip.py
+++ b/vllm/model_executor/models/blip.py
@@ -4,7 +4,6 @@
 within a vision language model."""
 
 from collections.abc import Iterable
-from typing import Optional, Union
 
 import torch
 import torch.nn as nn
@@ -38,7 +37,7 @@ def get_blip_num_patches(*, image_size: int, patch_size: int) -> int:
 
 # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa
 class BlipVisionEmbeddings(nn.Module):
-    def __init__(self, config: Union[BlipVisionConfig, Blip2VisionConfig]):
+    def __init__(self, config: BlipVisionConfig | Blip2VisionConfig):
         super().__init__()
 
         self.config = config
@@ -86,8 +85,8 @@ class BlipAttention(nn.Module):
 
     def __init__(
         self,
-        config: Union[BlipVisionConfig, Blip2VisionConfig],
-        quant_config: Optional[QuantizationConfig] = None,
+        config: BlipVisionConfig | Blip2VisionConfig,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -151,7 +150,7 @@ class BlipMLP(nn.Module):
     def __init__(
         self,
         config: BlipVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -186,7 +185,7 @@ class BlipEncoderLayer(nn.Module):
     def __init__(
         self,
         config: BlipVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -228,8 +227,8 @@ class BlipEncoder(nn.Module):
     def __init__(
         self,
         config: BlipVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-        num_hidden_layers_override: Optional[int] = None,
+        quant_config: QuantizationConfig | None = None,
+        num_hidden_layers_override: int | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -268,10 +267,10 @@ class BlipVisionModel(nn.Module, SupportsQuant):
     def __init__(
         self,
         config: BlipVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
-        require_post_norm: Optional[bool] = None,
+        num_hidden_layers_override: int | None = None,
+        require_post_norm: bool | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index 8e94d59350268..2986a72f2e487 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal, TypeAlias
 
 import torch
 import torch.nn as nn
@@ -70,7 +70,7 @@ class Blip2ImageEmbeddingInputs(TensorSchema):
     data: Annotated[torch.Tensor, TensorShape("bn", "f", "h")]
 
 
-Blip2ImageInputs = Union[Blip2ImagePixelInputs, Blip2ImageEmbeddingInputs]
+Blip2ImageInputs: TypeAlias = Blip2ImagePixelInputs | Blip2ImageEmbeddingInputs
 
 
 class Blip2QFormerMultiHeadAttention(nn.Module):
@@ -78,8 +78,8 @@ class Blip2QFormerMultiHeadAttention(nn.Module):
         self,
         config: Blip2QFormerConfig,
         *,
-        quant_config: Optional[QuantizationConfig],
-        cache_config: Optional[CacheConfig],
+        quant_config: QuantizationConfig | None,
+        cache_config: CacheConfig | None,
         is_cross_attention: bool = False,
         prefix: str = "",
     ) -> None:
@@ -123,7 +123,7 @@ class Blip2QFormerMultiHeadAttention(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: torch.FloatTensor | None = None,
     ):
         is_cross_attention = encoder_hidden_states is not None
 
@@ -179,8 +179,8 @@ class Blip2QFormerAttention(nn.Module):
         self,
         config: Blip2QFormerConfig,
         *,
-        quant_config: Optional[QuantizationConfig],
-        cache_config: Optional[CacheConfig],
+        quant_config: QuantizationConfig | None,
+        cache_config: CacheConfig | None,
         is_cross_attention: bool = False,
         prefix: str = "",
     ) -> None:
@@ -199,7 +199,7 @@ class Blip2QFormerAttention(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: torch.FloatTensor | None = None,
     ) -> tuple[torch.Tensor]:
         self_output = self.attention(
             hidden_states,
@@ -247,8 +247,8 @@ class Blip2QFormerLayer(nn.Module):
         self,
         config: Blip2QFormerConfig,
         *,
-        quant_config: Optional[QuantizationConfig],
-        cache_config: Optional[CacheConfig],
+        quant_config: QuantizationConfig | None,
+        cache_config: CacheConfig | None,
         layer_idx: int,
         prefix: str = "",
     ) -> None:
@@ -340,8 +340,8 @@ class Blip2QFormerEncoder(nn.Module):
         self,
         config: Blip2QFormerConfig,
         *,
-        quant_config: Optional[QuantizationConfig],
-        cache_config: Optional[CacheConfig],
+        quant_config: QuantizationConfig | None,
+        cache_config: CacheConfig | None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -385,8 +385,8 @@ class Blip2QFormerModel(nn.Module):
         self,
         config: Blip2QFormerConfig,
         *,
-        quant_config: Optional[QuantizationConfig],
-        cache_config: Optional[CacheConfig],
+        quant_config: QuantizationConfig | None,
+        cache_config: CacheConfig | None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -426,7 +426,7 @@ class Blip2ProcessingInfo(BaseProcessingInfo):
     def get_hf_config(self):
         return self.ctx.get_hf_config(Blip2Config)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": 1}
 
     def get_num_image_tokens(self) -> int:
@@ -442,7 +442,7 @@ class Blip2DummyInputsBuilder(BaseDummyInputsBuilder[Blip2ProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         hf_config = self.info.get_hf_config()
         vision_config = hf_config.vision_config
@@ -526,7 +526,7 @@ class Blip2ForConditionalGeneration(
     merge_by_field_config = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return None
 
@@ -573,7 +573,7 @@ class Blip2ForConditionalGeneration(
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[Blip2ImageInputs]:
+    ) -> Blip2ImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
 
@@ -641,8 +641,8 @@ class Blip2ForConditionalGeneration(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> IntermediateTensors:
         """Run forward pass for BLIP-2.
@@ -687,7 +687,7 @@ class Blip2ForConditionalGeneration(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py
index 4a814fc4020d7..bbbd14adf92b2 100644
--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@@ -22,7 +22,6 @@
 import math
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -89,8 +88,8 @@ class BloomAttention(nn.Module):
     def __init__(
         self,
         config: BloomConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -152,7 +151,7 @@ class BloomMLP(nn.Module):
     def __init__(
         self,
         config: BloomConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
         hidden_size = config.hidden_size
@@ -179,8 +178,8 @@ class BloomBlock(nn.Module):
     def __init__(
         self,
         config: BloomConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -274,9 +273,9 @@ class BloomModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -356,9 +355,9 @@ class BloomForCausalLM(nn.Module, SupportsPP, SupportsQuant):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.transformer(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -367,7 +366,7 @@ class BloomForCausalLM(nn.Module, SupportsPP, SupportsQuant):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index d8756e236f4cc..2ca761dd2b550 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -4,7 +4,7 @@
 from collections.abc import Iterable, Mapping, Sequence
 from functools import cached_property
 from itertools import islice
-from typing import Annotated, Any, Literal, Optional, Union
+from typing import Annotated, Any, Literal
 
 import torch
 import torch.nn as nn
@@ -94,7 +94,7 @@ class ChameleonProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self, **kwargs: object):
         return self.ctx.get_hf_processor(ChameleonProcessor, **kwargs)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": 1}
 
     def get_num_image_tokens(self) -> int:
@@ -115,7 +115,7 @@ class ChameleonDummyInputsBuilder(BaseDummyInputsBuilder[ChameleonProcessingInfo
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         config = self.info.get_hf_config()
 
@@ -225,7 +225,7 @@ class ChameleonMLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
     ) -> None:
         super().__init__()
@@ -262,11 +262,11 @@ class ChameleonAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 4096,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -357,8 +357,8 @@ class ChameleonDecoderLayer(nn.Module):
     def __init__(
         self,
         config: ChameleonConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -403,8 +403,8 @@ class ChameleonDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        residual: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         if residual is None:
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
@@ -426,8 +426,8 @@ class ChameleonSwinDecoderLayer(nn.Module):
     def __init__(
         self,
         config: ChameleonConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -472,7 +472,7 @@ class ChameleonSwinDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         residual = hidden_states
         hidden_states = self.self_attn(
@@ -896,11 +896,11 @@ class ChameleonModel(nn.Module):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -941,7 +941,7 @@ class ChameleonForConditionalGeneration(
     }
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
 
@@ -975,7 +975,7 @@ class ChameleonForConditionalGeneration(
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[ChameleonImagePixelInputs]:
+    ) -> ChameleonImagePixelInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
 
         if pixel_values is None:
@@ -1008,10 +1008,10 @@ class ChameleonForConditionalGeneration(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
@@ -1023,7 +1023,7 @@ class ChameleonForConditionalGeneration(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
 
         # Disallow image tokens which does not include special
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index ece719df61f7c..bcbe82b78c3b1 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -7,7 +7,6 @@
 import json
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -50,8 +49,8 @@ class GLMAttention(nn.Module):
     def __init__(
         self,
         config: ChatGLMConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -144,7 +143,7 @@ class GLMMLP(nn.Module):
     def __init__(
         self,
         config: ChatGLMConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -190,8 +189,8 @@ class GLMBlock(nn.Module):
     def __init__(
         self,
         config: ChatGLMConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -263,8 +262,8 @@ class GLMTransformer(nn.Module):
     def __init__(
         self,
         config: ChatGLMConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -295,7 +294,7 @@ class GLMTransformer(nn.Module):
         self,
         hidden_states: torch.Tensor,
         position_ids: torch.Tensor,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         for layer in islice(self.layers, self.start_layer, self.end_layer):
             hidden_states = layer(
                 hidden_states=hidden_states, position_ids=position_ids
@@ -361,10 +360,10 @@ class ChatGLMModel(nn.Module, SupportsQuant):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -459,7 +458,7 @@ class ChatGLMBaseModel(nn.Module):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
@@ -491,9 +490,9 @@ class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP, SupportsQua
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.transformer(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index f05d5c4cc1d8b..3d7b28af8bdbe 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Mapping, Sequence
 from functools import cached_property
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal
 
 import torch
 import torch.nn as nn
@@ -125,7 +125,7 @@ class CLIPProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self, **kwargs: object):
         return self.ctx.get_hf_processor(CLIPProcessor, **kwargs)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": 1}
 
     def get_num_image_tokens(
@@ -169,7 +169,7 @@ class CLIPDummyInputsBuilder(BaseDummyInputsBuilder[CLIPProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
@@ -199,12 +199,12 @@ class CLIPMultiModalProcessor(BaseMultiModalProcessor[CLIPProcessingInfo]):
 
     def apply(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-        tokenization_kwargs: Optional[Mapping[str, object]] = None,
+        tokenization_kwargs: Mapping[str, object] | None = None,
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalInputs:
         if prompt and mm_data:
             raise ValueError(
@@ -286,9 +286,9 @@ class CLIPTextEmbeddings(nn.Module):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         position_ids: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if inputs_embeds is None:
             if input_ids is None:
@@ -350,11 +350,11 @@ class CLIPVisionEmbeddings(nn.Module):
 class CLIPAttention(nn.Module):
     def __init__(
         self,
-        config: Union[CLIPTextConfig, CLIPVisionConfig],
-        quant_config: Optional[QuantizationConfig] = None,
+        config: CLIPTextConfig | CLIPVisionConfig,
+        quant_config: QuantizationConfig | None = None,
         *,
         prefix: str = "",
-        attn_cls: Union[type[Attention], type[MultiHeadAttention]],
+        attn_cls: type[Attention] | type[MultiHeadAttention],
     ) -> None:
         super().__init__()
 
@@ -412,8 +412,8 @@ class CLIPAttention(nn.Module):
 class CLIPMLP(nn.Module):
     def __init__(
         self,
-        config: Union[CLIPTextConfig, CLIPVisionConfig],
-        quant_config: Optional[QuantizationConfig] = None,
+        config: CLIPTextConfig | CLIPVisionConfig,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -445,11 +445,11 @@ class CLIPMLP(nn.Module):
 class CLIPEncoderLayer(nn.Module):
     def __init__(
         self,
-        config: Union[CLIPTextConfig, CLIPVisionConfig],
-        quant_config: Optional[QuantizationConfig] = None,
+        config: CLIPTextConfig | CLIPVisionConfig,
+        quant_config: QuantizationConfig | None = None,
         *,
         prefix: str = "",
-        attn_cls: Union[type[Attention], type[MultiHeadAttention]],
+        attn_cls: type[Attention] | type[MultiHeadAttention],
     ) -> None:
         super().__init__()
         self.self_attn = CLIPAttention(
@@ -488,12 +488,12 @@ class CLIPEncoder(nn.Module):
 
     def __init__(
         self,
-        config: Union[CLIPTextConfig, CLIPVisionConfig],
-        quant_config: Optional[QuantizationConfig] = None,
-        num_hidden_layers_override: Optional[int] = None,
+        config: CLIPTextConfig | CLIPVisionConfig,
+        quant_config: QuantizationConfig | None = None,
+        num_hidden_layers_override: int | None = None,
         *,
         prefix: str = "",
-        attn_cls: Union[type[Attention], type[MultiHeadAttention]],
+        attn_cls: type[Attention] | type[MultiHeadAttention],
     ) -> None:
         super().__init__()
 
@@ -519,7 +519,7 @@ class CLIPEncoder(nn.Module):
         self,
         inputs_embeds: torch.Tensor,
         return_all_hidden_states: bool,
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+    ) -> torch.Tensor | list[torch.Tensor]:
         hidden_states_pool = [inputs_embeds]
         hidden_states = inputs_embeds
 
@@ -538,7 +538,7 @@ class CLIPTextTransformer(nn.Module):
     def __init__(
         self,
         config: CLIPTextConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
         prefix: str = "",
     ) -> None:
@@ -566,9 +566,9 @@ class CLIPTextTransformer(nn.Module):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         position_ids: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         hidden_states = self.embeddings(
             input_ids=input_ids,
@@ -616,10 +616,10 @@ class CLIPVisionTransformer(nn.Module):
     def __init__(
         self,
         config: CLIPVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
-        require_post_norm: Optional[bool] = None,
+        num_hidden_layers_override: int | None = None,
+        require_post_norm: bool | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -669,8 +669,8 @@ class CLIPVisionTransformer(nn.Module):
         self,
         pixel_values: torch.Tensor,
         *,
-        select_layers: Optional[list[int]] = None,
-        feature_select_strategy: Optional[VisionFeatureSelectStrategy] = None,
+        select_layers: list[int] | None = None,
+        feature_select_strategy: VisionFeatureSelectStrategy | None = None,
     ) -> torch.Tensor:
         hidden_states = self.embeddings(pixel_values)
         hidden_states = self.pre_layrnorm(hidden_states)
@@ -736,10 +736,10 @@ class CLIPVisionModel(nn.Module):
     def __init__(
         self,
         config: CLIPVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
-        require_post_norm: Optional[bool] = None,
+        num_hidden_layers_override: int | None = None,
+        require_post_norm: bool | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -755,8 +755,8 @@ class CLIPVisionModel(nn.Module):
     def forward(
         self,
         pixel_values: torch.Tensor,
-        select_layers: Optional[list[int]] = None,
-        feature_select_strategy: Optional[VisionFeatureSelectStrategy] = None,
+        select_layers: list[int] | None = None,
+        feature_select_strategy: VisionFeatureSelectStrategy | None = None,
     ) -> torch.Tensor:
         return self.vision_model(
             pixel_values,
@@ -787,7 +787,7 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
     merge_by_field_config = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return None
 
@@ -847,9 +847,9 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
 
     def get_text_features(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         position_ids: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         pooled_output = self.text_model(
             input_ids=input_ids,
@@ -864,7 +864,7 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
     def get_image_features(
         self,
         pixel_values: torch.Tensor,
-        feature_select_strategy: Optional[VisionFeatureSelectStrategy] = None,
+        feature_select_strategy: VisionFeatureSelectStrategy | None = None,
     ) -> torch.Tensor:
         if feature_select_strategy is None:
             feature_select_strategy = _get_vision_feature_select_strategy(
@@ -883,7 +883,7 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[CLIPImagePixelInputs]:
+    ) -> CLIPImagePixelInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         if pixel_values is None:
             return None
@@ -906,9 +906,9 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
         *,
-        is_multimodal: Optional[torch.Tensor] = None,
+        is_multimodal: torch.Tensor | None = None,
         handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
         self._is_text_input = (
@@ -936,10 +936,10 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> torch.Tensor:
         if intermediate_tensors is not None:
diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py
index 73aafbd011444..19cc31c9bd18b 100644
--- a/vllm/model_executor/models/cohere2_vision.py
+++ b/vllm/model_executor/models/cohere2_vision.py
@@ -4,7 +4,7 @@
 """Command-A-Vision (Cohere2Vision) multimodal model implementation for vLLM."""
 
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal
 
 import torch
 from torch import nn
@@ -148,7 +148,7 @@ class Cohere2VisionProcessingInfo(BaseProcessingInfo):
     def get_image_processor(self, **kwargs: object):
         return self.get_hf_processor(**kwargs).image_processor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_image_size_with_most_features(self) -> ImageSize:
@@ -163,7 +163,7 @@ class Cohere2VisionProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[Cohere2VisionProcessor],
+        processor: Cohere2VisionProcessor | None,
     ) -> int:
         """
         Calculate the number of image patches for a given image.
@@ -217,7 +217,7 @@ class Cohere2VisionDummyInputsBuilder(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         image_size = self.info.get_image_size_with_most_features()
@@ -404,7 +404,7 @@ class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, Suppo
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[Cohere2VisionImagePixelInputs]:
+    ) -> Cohere2VisionImagePixelInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         num_patches = kwargs.pop("num_patches", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -450,10 +450,10 @@ class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, Suppo
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
@@ -468,5 +468,5 @@ class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, Suppo
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index e38c3c0492fbf..75459601f76b0 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -25,7 +25,6 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -94,8 +93,8 @@ class LayerNorm(nn.Module):
 class CohereMLP(nn.Module):
     def __init__(
         self,
-        config: Union[CohereConfig, Cohere2Config],
-        quant_config: Optional[QuantizationConfig] = None,
+        config: CohereConfig | Cohere2Config,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -128,9 +127,9 @@ class CohereMLP(nn.Module):
 class CohereAttention(nn.Module):
     def __init__(
         self,
-        config: Union[CohereConfig, Cohere2Config],
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        config: CohereConfig | Cohere2Config,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -241,9 +240,9 @@ class CohereAttention(nn.Module):
 class CohereDecoderLayer(nn.Module):
     def __init__(
         self,
-        config: Union[CohereConfig, Cohere2Config],
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        config: CohereConfig | Cohere2Config,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -265,7 +264,7 @@ class CohereDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         residual = hidden_states
@@ -324,9 +323,9 @@ class CohereModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -452,9 +451,9 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -463,7 +462,7 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         is_not_lora = hasattr(self.model.embed_tokens, "weight")
         if is_not_lora:
             logits = self.logits_processor(self.model.embed_tokens, hidden_states)
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index 8ec7a82a7b2ad..088960e064489 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -3,7 +3,6 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 import torch.nn as nn
@@ -54,7 +53,7 @@ class DbrxRouter(nn.Module):
     def __init__(
         self,
         config: DbrxConfig,
-        params_dtype: Optional[torch.dtype] = None,
+        params_dtype: torch.dtype | None = None,
     ):
         super().__init__()
         self.tp_size = get_tensor_model_parallel_world_size()
@@ -77,8 +76,8 @@ class DbrxExperts(FusedMoE):
     def __init__(
         self,
         config: DbrxConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-        params_dtype: Optional[torch.dtype] = None,
+        quant_config: QuantizationConfig | None = None,
+        params_dtype: torch.dtype | None = None,
         prefix: str = "",
     ):
         super().__init__(
@@ -157,8 +156,8 @@ class DbrxMoE(nn.Module):
     def __init__(
         self,
         config: DbrxConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-        params_dtype: Optional[torch.dtype] = None,
+        quant_config: QuantizationConfig | None = None,
+        params_dtype: torch.dtype | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -189,8 +188,8 @@ class DbrxAttention(nn.Module):
     def __init__(
         self,
         config: DbrxConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -270,8 +269,8 @@ class DbrxFusedNormAttention(nn.Module):
     def __init__(
         self,
         config: DbrxConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -303,8 +302,8 @@ class DbrxBlock(nn.Module):
     def __init__(
         self,
         config: DbrxConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -361,9 +360,9 @@ class DbrxModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -466,9 +465,9 @@ class DbrxForCausalLM(nn.Module, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.transformer(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -477,7 +476,7 @@ class DbrxForCausalLM(nn.Module, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py
index 67258c2f77b83..ac934abea45df 100644
--- a/vllm/model_executor/models/deepseek.py
+++ b/vllm/model_executor/models/deepseek.py
@@ -26,7 +26,7 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -76,7 +76,7 @@ class DeepseekMLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         reduce_results: bool = True,
         prefix: str = "",
     ) -> None:
@@ -108,7 +108,7 @@ class DeepseekMoE(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -203,10 +203,10 @@ class DeepseekAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -282,8 +282,8 @@ class DeepseekDecoderLayer(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -328,7 +328,7 @@ class DeepseekDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
         # Self Attention
         if residual is None:
@@ -382,9 +382,9 @@ class DeepseekModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -489,9 +489,9 @@ class DeepseekForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -500,7 +500,7 @@ class DeepseekForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py
index faa7edd4bc3c3..107b1e1a05823 100644
--- a/vllm/model_executor/models/deepseek_eagle.py
+++ b/vllm/model_executor/models/deepseek_eagle.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -224,7 +223,7 @@ class EagleDeepseekV3ForCausalLM(DeepseekV3ForCausalLM):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if inputs_embeds is not None:
             raise NotImplementedError(
@@ -235,7 +234,7 @@ class EagleDeepseekV3ForCausalLM(DeepseekV3ForCausalLM):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py
index bf3ab7bb3079b..de80833130179 100644
--- a/vllm/model_executor/models/deepseek_mtp.py
+++ b/vllm/model_executor/models/deepseek_mtp.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -30,7 +29,7 @@ class SharedHead(nn.Module):
         self,
         config: PretrainedConfig,
         prefix: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ) -> None:
         super().__init__()
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -84,7 +83,7 @@ class DeepSeekMultiTokenPredictorLayer(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         previous_hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_index: int = 0,
     ) -> torch.Tensor:
         assert inputs_embeds is not None
@@ -136,7 +135,7 @@ class DeepSeekMultiTokenPredictor(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         previous_hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         if inputs_embeds is None:
@@ -180,8 +179,8 @@ class DeepSeekMTP(nn.Module, SupportsPP):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         hidden_states = self.model(
@@ -193,7 +192,7 @@ class DeepSeekMTP(nn.Module, SupportsPP):
         self,
         hidden_states: torch.Tensor,
         spec_step_idx: int = 0,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.model.compute_logits(hidden_states, spec_step_idx)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index f8456c5452494..970fa80826aba 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -27,7 +27,7 @@
 import typing
 from collections.abc import Callable, Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -106,7 +106,7 @@ class DeepseekV2MLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         reduce_results: bool = True,
         is_sequence_parallel=False,
         prefix: str = "",
@@ -150,9 +150,9 @@ class DeepseekV2MLP(nn.Module):
 class DeepseekV2MoE(nn.Module):
     def __init__(
         self,
-        config: Union[DeepseekV2Config, DeepseekV3Config],
+        config: DeepseekV2Config | DeepseekV3Config,
         parallel_config: ParallelConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -301,7 +301,7 @@ class DeepseekV2Attention(nn.Module):
     def __init__(
         self,
         vllm_config: VllmConfig,
-        config: Union[DeepseekV2Config, DeepseekV3Config],
+        config: DeepseekV2Config | DeepseekV3Config,
         hidden_size: int,
         num_heads: int,
         qk_nope_head_dim: int,
@@ -310,11 +310,11 @@ class DeepseekV2Attention(nn.Module):
         q_lora_rank: int,
         kv_lora_rank: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        topk_indices_buffer: Optional[torch.Tensor] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        topk_indices_buffer: torch.Tensor | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -554,12 +554,12 @@ def sparse_attn_indexer(
     k: torch.Tensor,
     weights: torch.Tensor,
     quant_block_size: int,
-    scale_fmt: Optional[str],
+    scale_fmt: str | None,
     topk_tokens: int,
     head_dim: int,
     max_model_len: int,
     total_seq_lens: int,
-    topk_indices_buffer: Optional[torch.Tensor],
+    topk_indices_buffer: torch.Tensor | None,
 ) -> torch.Tensor:
     # careful! this will be None in dummy run
     attn_metadata = get_forward_context().attn_metadata
@@ -727,12 +727,12 @@ def sparse_attn_indexer_fake(
     k: torch.Tensor,
     weights: torch.Tensor,
     quant_block_size: int,
-    scale_fmt: Optional[str],
+    scale_fmt: str | None,
     topk_tokens: int,
     head_dim: int,
     max_model_len: int,
     total_seq_lens: int,
-    topk_indices_buffer: Optional[torch.Tensor],
+    topk_indices_buffer: torch.Tensor | None,
 ) -> torch.Tensor:
     # profile run
     # NOTE(Chen): create the max possible flattened_kv. So that
@@ -758,12 +758,12 @@ class Indexer(nn.Module):
     def __init__(
         self,
         vllm_config: VllmConfig,
-        config: Union[DeepseekV2Config, DeepseekV3Config],
+        config: DeepseekV2Config | DeepseekV3Config,
         hidden_size: int,
         q_lora_rank: int,
-        quant_config: Optional[QuantizationConfig],
-        cache_config: Optional[CacheConfig],
-        topk_indices_buffer: Optional[torch.Tensor],
+        quant_config: QuantizationConfig | None,
+        cache_config: CacheConfig | None,
+        topk_indices_buffer: torch.Tensor | None,
         prefix: str = "",
     ):
         super().__init__()
@@ -880,21 +880,21 @@ class DeepseekV2MLAAttention(nn.Module):
     def __init__(
         self,
         vllm_config: VllmConfig,
-        config: Union[DeepseekV2Config, DeepseekV3Config],
+        config: DeepseekV2Config | DeepseekV3Config,
         hidden_size: int,
         num_heads: int,
         qk_nope_head_dim: int,
         qk_rope_head_dim: int,
         v_head_dim: int,
-        q_lora_rank: Optional[int],
+        q_lora_rank: int | None,
         kv_lora_rank: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
-        topk_indices_buffer: Optional[torch.Tensor] = None,
+        topk_indices_buffer: torch.Tensor | None = None,
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -1045,8 +1045,8 @@ class DeepseekV2DecoderLayer(nn.Module):
         self,
         vllm_config: VllmConfig,
         prefix: str,
-        config: Optional[DeepseekV2Config] = None,
-        topk_indices_buffer: Optional[torch.Tensor] = None,
+        config: DeepseekV2Config | None = None,
+        topk_indices_buffer: torch.Tensor | None = None,
     ) -> None:
         super().__init__()
 
@@ -1117,7 +1117,7 @@ class DeepseekV2DecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
         # Self Attention
         if residual is None:
@@ -1212,9 +1212,9 @@ class DeepseekV2Model(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -1347,9 +1347,9 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts, SupportsLoR
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -1358,7 +1358,7 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts, SupportsLoR
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
@@ -1497,8 +1497,8 @@ class DeepseekV3ForCausalLM(DeepseekV2ForCausalLM):
 # Compatibility with
 # https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/configuration_deepseek.py
 def get_spec_layer_idx_from_weight_name(
-    config: Union[DeepseekV2Config, DeepseekV3Config], weight_name: str
-) -> Optional[int]:
+    config: DeepseekV2Config | DeepseekV3Config, weight_name: str
+) -> int | None:
     if (
         hasattr(config, "num_nextn_predict_layers")
         and config.num_nextn_predict_layers > 0
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 8226e88c47a2c..094a7e73b3aae 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -6,7 +6,7 @@
 
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal, TypeAlias
 
 import torch
 import torch.nn as nn
@@ -88,14 +88,12 @@ class DeepseekVL2VImageEmbeddingInputs(TensorSchema):
     """
 
     type: Literal["image_embeds"]
-    data: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]], TensorShape("bn", "f", "h")
-    ]
+    data: Annotated[torch.Tensor | list[torch.Tensor], TensorShape("bn", "f", "h")]
 
 
-DeepseekVL2ImageInputs = Union[
-    DeepseekVL2ImagePixelInputs, DeepseekVL2VImageEmbeddingInputs
-]
+DeepseekVL2ImageInputs: TypeAlias = (
+    DeepseekVL2ImagePixelInputs | DeepseekVL2VImageEmbeddingInputs
+)
 
 
 class MlpProjector(nn.Module):
@@ -161,7 +159,7 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self, **kwargs: object):
         return self.ctx.get_hf_processor(DeepseekVLV2Processor, **kwargs)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_num_image_tokens(
@@ -214,7 +212,7 @@ class DeepseekVL2DummyInputsBuilder(BaseDummyInputsBuilder[DeepseekVL2Processing
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
@@ -310,11 +308,11 @@ class DeepseekVL2MultiModalProcessor(
 
     def _cached_apply_hf_processor(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         # The processor logic is different for len(images) <= 2 vs > 2
         # Since the processing cache assumes that the processor output is
@@ -353,7 +351,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
     )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
 
@@ -454,7 +452,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
     def _init_vision_module(
         self,
         vision_config: VisionEncoderConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         prefix: str = "",
     ) -> nn.Module:
         # TODO: refactor vision model through timm wrapper from transformers
@@ -480,7 +478,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[DeepseekVL2ImageInputs]:
+    ) -> DeepseekVL2ImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         images_spatial_crop = kwargs.pop("images_spatial_crop", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -637,8 +635,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ):
         if intermediate_tensors is not None:
@@ -653,7 +651,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py
index 55f8d4b231f78..c33cb3d84478e 100644
--- a/vllm/model_executor/models/dots1.py
+++ b/vllm/model_executor/models/dots1.py
@@ -27,7 +27,7 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -80,7 +80,7 @@ class Dots1MLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         reduce_results: bool = True,
         prefix: str = "",
     ) -> None:
@@ -117,7 +117,7 @@ class Dots1MoE(nn.Module):
     def __init__(
         self,
         config: Dots1Config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -203,10 +203,10 @@ class Dots1Attention(nn.Module):
         num_kv_heads: int,
         config: Dots1Config,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -289,8 +289,8 @@ class Dots1DecoderLayer(nn.Module):
         config: Dots1Config,
         prefix: str,
         model_config: ModelConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
@@ -338,7 +338,7 @@ class Dots1DecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
         if residual is None:
             residual = hidden_states
@@ -403,9 +403,9 @@ class Dots1Model(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -546,9 +546,9 @@ class Dots1ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids,
             positions,
@@ -560,7 +560,7 @@ class Dots1ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py
index d1a9f4cb3b2e7..bd7f37b07de32 100644
--- a/vllm/model_executor/models/dots_ocr.py
+++ b/vllm/model_executor/models/dots_ocr.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Mapping
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal, TypeAlias
 
 import torch
 import torch.nn as nn
@@ -92,7 +92,7 @@ class DotsOCRImageEmbeddingInputs(TensorSchema):
     image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)]
 
 
-DotsOCRImageInputs = Union[DotsOCRImagePixelInputs, DotsOCRImageEmbeddingInputs]
+DotsOCRImageInputs: TypeAlias = DotsOCRImagePixelInputs | DotsOCRImageEmbeddingInputs
 
 
 class DotsOCRDummyInputsBuilder(Qwen2VLDummyInputsBuilder):
@@ -104,7 +104,7 @@ class DotsOCRDummyInputsBuilder(Qwen2VLDummyInputsBuilder):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
@@ -134,7 +134,7 @@ class DotsOCRProcessingInfo(Qwen2VLProcessingInfo):
 
         return config
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_mm_max_tokens_per_item(
@@ -253,7 +253,7 @@ class DotsVisionAttention(nn.Module):
         num_heads: int = 16,
         bias: bool = True,
         *,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -316,10 +316,10 @@ class DotsVisionAttention(nn.Module):
         self,
         hidden_states: torch.Tensor,
         cu_seqlens: torch.Tensor,
-        rotary_pos_emb: Optional[torch.Tensor] = None,
+        rotary_pos_emb: torch.Tensor | None = None,
         *,
-        max_seqlen: Optional[int] = None,
-        seqlens: Optional[list[int]] = None,
+        max_seqlen: int | None = None,
+        seqlens: list[int] | None = None,
     ) -> torch.Tensor:
         # [S, C] -> [S, B=1, C]
         x = hidden_states.unsqueeze(1)
@@ -394,7 +394,7 @@ class DotsSwiGLUFFN(nn.Module):
         self,
         config,
         *,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -507,7 +507,7 @@ class DotsVisionBlock(nn.Module):
         self,
         config,
         *,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -537,8 +537,8 @@ class DotsVisionBlock(nn.Module):
         *,
         cu_seqlens: torch.Tensor,
         rotary_pos_emb: torch.Tensor,
-        max_seqlen: Optional[int] = None,
-        seqlens: Optional[list[int]] = None,
+        max_seqlen: int | None = None,
+        seqlens: list[int] | None = None,
     ) -> torch.Tensor:
         hidden_states = hidden_states + self.attn(
             self.norm1(hidden_states),
@@ -555,10 +555,10 @@ class DotsVisionTransformer(nn.Module):
     def __init__(
         self,
         config: DotsVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
-        require_post_norm: Optional[bool] = None,
+        num_hidden_layers_override: int | None = None,
+        require_post_norm: bool | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -653,7 +653,7 @@ class DotsVisionTransformer(nn.Module):
 
     def compute_attn_mask_seqlen(
         self, cu_seqlens: torch.Tensor
-    ) -> tuple[Optional[int], Optional[list[int]]]:
+    ) -> tuple[int | None, list[int] | None]:
         max_seqlen, seqlens = None, None
         if (
             self.attn_backend == _Backend.FLASH_ATTN
@@ -734,7 +734,7 @@ class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA
     supports_encoder_tp_data = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|img|><|imgpad|><|endofimg|>"
 
@@ -765,7 +765,7 @@ class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[DotsOCRImageInputs]:
+    ) -> DotsOCRImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
         image_grid_thw = kwargs.pop("image_grid_thw", None)
@@ -834,10 +834,10 @@ class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
         elif inputs_embeds is None:
@@ -861,7 +861,7 @@ class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py
index f0360d55a2e57..e01f26731cd92 100644
--- a/vllm/model_executor/models/ernie45_moe.py
+++ b/vllm/model_executor/models/ernie45_moe.py
@@ -25,7 +25,7 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -83,7 +83,7 @@ class Ernie4_5_MoeMLP(nn.Module):
         intermediate_size: int,
         hidden_act: str,
         use_bias: bool = False,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         reduce_results: bool = True,
         prefix: str = "",
     ) -> None:
@@ -120,7 +120,7 @@ class Ernie4_5_MoeMoE(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         enable_eplb: bool = False,
     ):
@@ -229,14 +229,14 @@ class Ernie4_5_MoeAttention(nn.Module):
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
-        head_dim: Optional[int] = None,
+        head_dim: int | None = None,
         rope_theta: float = 500000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 131072,
         rms_norm_eps: float = 1e-05,
         qkv_bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -323,8 +323,8 @@ class Ernie4_5_MoeDecoderLayer(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         enable_eplb: bool = False,
     ) -> None:
@@ -391,7 +391,7 @@ class Ernie4_5_MoeDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
         # Self Attention
         if residual is None:
@@ -467,9 +467,9 @@ class Ernie4_5_MoeModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -737,9 +737,9 @@ class Ernie4_5_MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, MixtureOfExpe
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -748,7 +748,7 @@ class Ernie4_5_MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, MixtureOfExpe
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index d5b2caa2ddfd6..dc465c87cf4b9 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -25,9 +25,9 @@
 
 import itertools
 import math
-from collections.abc import Iterable, Mapping, Sequence
+from collections.abc import Callable, Iterable, Mapping, Sequence
 from functools import partial
-from typing import Annotated, Any, Callable, Literal, Optional, Union
+from typing import Annotated, Any, Literal
 
 import numpy as np
 import torch
@@ -162,7 +162,7 @@ class Ernie4_5_VisionAttention(nn.Module):
         embed_dim: int,
         num_heads: int,
         projection_size: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -254,8 +254,8 @@ class Ernie4_5_VisionAttention(nn.Module):
         x: torch.Tensor,
         cu_seqlens: torch.Tensor,
         rotary_pos_emb: torch.Tensor,
-        max_seqlen: Optional[int] = None,  # Only used for Flash Attention
-        seqlens: Optional[list[int]] = None,  # Only used for xFormers
+        max_seqlen: int | None = None,  # Only used for Flash Attention
+        seqlens: list[int] | None = None,  # Only used for xFormers
     ) -> torch.Tensor:
         # [s, b, c] --> [s, b, head * 3 * head_dim]
         x, _ = self.qkv(x)
@@ -332,7 +332,7 @@ class Ernie4_5_VisionMLP(nn.Module):
         in_features: int,
         hidden_features: int,
         act_layer: type[nn.Module] = QuickGELU,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -364,8 +364,8 @@ class Ernie4_5_VisionBlock(nn.Module):
         num_heads: int,
         mlp_ratio: float,
         act_layer: type[nn.Module] = QuickGELU,
-        norm_layer: Optional[Callable[[int], nn.Module]] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        norm_layer: Callable[[int], nn.Module] | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -397,8 +397,8 @@ class Ernie4_5_VisionBlock(nn.Module):
         hidden_states: torch.Tensor,
         cu_seqlens: torch.Tensor,
         rotary_pos_emb: torch.Tensor,
-        max_seqlen: Optional[int] = None,  # Only used for Flash Attention
-        seqlens: Optional[list[int]] = None,  # Only used for xFormers
+        max_seqlen: int | None = None,  # Only used for Flash Attention
+        seqlens: list[int] | None = None,  # Only used for xFormers
     ) -> torch.Tensor:
         hidden_states = hidden_states + self.attn(
             self.norm1(hidden_states),
@@ -456,7 +456,7 @@ class Ernie4_5_VisionTransformer(nn.Module):
         self,
         vision_config,
         norm_eps: float = 1e-6,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -553,7 +553,7 @@ class Ernie4_5_VisionTransformer(nn.Module):
 
     def compute_attn_mask_seqlen(
         self, cu_seqlens: torch.Tensor
-    ) -> tuple[Optional[int], Optional[list[int]]]:
+    ) -> tuple[int | None, list[int] | None]:
         max_seqlen, seqlens = None, None
         if (
             self.attn_backend == _Backend.FLASH_ATTN
@@ -659,15 +659,15 @@ Ernie4_5_VLVideoInputs = Ernie4_5_VLVideoPixelInputs
 # === Vision Processor === #
 
 
-def round_by_factor(number: Union[int, float], factor: int) -> int:
+def round_by_factor(number: int | float, factor: int) -> int:
     return round(number / factor) * factor
 
 
-def ceil_by_factor(number: Union[int, float], factor: int) -> int:
+def ceil_by_factor(number: int | float, factor: int) -> int:
     return math.ceil(number / factor) * factor
 
 
-def floor_by_factor(number: Union[int, float], factor: int) -> int:
+def floor_by_factor(number: int | float, factor: int) -> int:
     return math.floor(number / factor) * factor
 
 
@@ -901,7 +901,7 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
     def get_image_processor(self, **kwargs: object):
         return self.get_hf_processor(**kwargs).image_processor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None, "video": None}
 
     def get_mm_max_tokens_per_item(
@@ -920,7 +920,7 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
         image_height: int,
         num_frames: int = 1,
         do_resize: bool = True,
-        image_processor: Optional[Any],
+        image_processor: Any | None,
     ) -> tuple[ImageSize, int]:
         if image_processor is None:
             image_processor = self.get_image_processor()
@@ -957,7 +957,7 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        image_processor: Optional[Any],
+        image_processor: Any | None,
     ) -> int:
         _, num_image_tokens = self._get_vision_info(
             image_width=image_width,
@@ -972,7 +972,7 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
         image_width: int,
         image_height: int,
         num_frames: int,
-        image_processor: Optional[Any],
+        image_processor: Any | None,
     ) -> int:
         _, num_video_tokens = self._get_vision_info(
             image_width=image_width,
@@ -1237,7 +1237,7 @@ class Ernie4_5_VLDummyInputsBuilder(BaseDummyInputsBuilder[Ernie4_5_VLProcessing
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
@@ -1310,7 +1310,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(
     )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>"
         if modality.startswith("video"):
@@ -1356,7 +1356,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """compute logits"""
         return self.language_model.compute_logits(hidden_states)
 
@@ -1395,12 +1395,12 @@ class Ernie4_5_VLMoeForConditionalGeneration(
         cls,
         input_tokens: list[int],
         hf_config: PretrainedConfig,
-        image_grid_thw: Union[list[list[int]], torch.Tensor],
-        video_grid_thw: Union[list[list[int]], torch.Tensor],
+        image_grid_thw: list[list[int]] | torch.Tensor,
+        video_grid_thw: list[list[int]] | torch.Tensor,
         context_len: int = 0,
-        seq_len: Optional[int] = None,
-        second_per_grid_ts: Optional[list[float]] = None,
-        audio_feature_lengths: Optional[torch.Tensor] = None,
+        seq_len: int | None = None,
+        second_per_grid_ts: list[float] | None = None,
+        audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
         """Get mrope input positions and delta value for Ernie VL."""
@@ -1540,7 +1540,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[Ernie4_5_VLImageInputs]:
+    ) -> Ernie4_5_VLImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_grid_thw = kwargs.pop("image_grid_thw", None)
 
@@ -1556,7 +1556,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(
 
     def _parse_and_validate_video_input(
         self, **kwargs: object
-    ) -> Optional[Ernie4_5_VLVideoInputs]:
+    ) -> Ernie4_5_VLVideoInputs | None:
         pixel_values_videos = kwargs.pop("pixel_values_videos", None)
         video_grid_thw = kwargs.pop("video_grid_thw", None)
 
@@ -1631,7 +1631,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(
 
     def get_multimodal_embeddings(
         self, **kwargs: object
-    ) -> Optional[MultiModalEmbeddings]:
+    ) -> MultiModalEmbeddings | None:
         modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not modalities:
             return None
@@ -1657,9 +1657,9 @@ class Ernie4_5_VLMoeForConditionalGeneration(
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
         *,
-        is_multimodal: Optional[torch.Tensor] = None,
+        is_multimodal: torch.Tensor | None = None,
         handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
         if multimodal_embeddings is not None and len(multimodal_embeddings) > 0:
@@ -1680,8 +1680,8 @@ class Ernie4_5_VLMoeForConditionalGeneration(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
     ):
         forward_kwargs = {
diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py
index 2c49895561409..ace7e333e2137 100644
--- a/vllm/model_executor/models/ernie45_vl_moe.py
+++ b/vllm/model_executor/models/ernie45_vl_moe.py
@@ -25,7 +25,7 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -74,7 +74,7 @@ logger = init_logger(__name__)
 
 
 class Ernie4_5_VLMoeMLP(Ernie4_5_MoeMLP):
-    def __init__(self, shared_experts: Optional[torch.nn.Module] = None, **kwargs):
+    def __init__(self, shared_experts: torch.nn.Module | None = None, **kwargs):
         super().__init__(**kwargs)
         self.shared_experts = shared_experts
 
@@ -91,15 +91,15 @@ class Ernie4_5_VLMoeAttention(nn.Module):
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
-        head_dim: Optional[int] = None,
+        head_dim: int | None = None,
         rope_theta: float = 500000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         freq_allocation: int = 20,
         max_position_embeddings: int = 131072,
         rms_norm_eps: float = 1e-05,
         qkv_bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -192,7 +192,7 @@ class Ernie4_5_VLMoeMoE(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -390,8 +390,8 @@ class Ernie4_5_VLMoeDecoderLayer(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -463,8 +463,8 @@ class Ernie4_5_VLMoeDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
-        visual_token_mask: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
+        visual_token_mask: torch.Tensor | None,
         **kwargs: object,
     ) -> torch.Tensor:
         # Self Attention
@@ -551,11 +551,11 @@ class Ernie4_5_VLMoeModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        visual_token_mask: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        visual_token_mask: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -632,10 +632,10 @@ class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds, **kwargs
         )
@@ -644,7 +644,7 @@ class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/ernie_mtp.py b/vllm/model_executor/models/ernie_mtp.py
index 46a7131f2499a..e7036840388cc 100644
--- a/vllm/model_executor/models/ernie_mtp.py
+++ b/vllm/model_executor/models/ernie_mtp.py
@@ -24,7 +24,6 @@
 """Inference-only Ernie-MTP model."""
 
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -121,7 +120,7 @@ class ErnieMultiTokenPredictor(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         previous_hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         if inputs_embeds is None:
@@ -169,8 +168,8 @@ class ErnieMTP(nn.Module, SupportsPP):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         assert spec_step_idx == 0, "ernie_mtp only support predict one token"
@@ -183,7 +182,7 @@ class ErnieMTP(nn.Module, SupportsPP):
         self,
         hidden_states: torch.Tensor,
         spec_step_idx: int = 0,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.model.compute_logits(hidden_states, self.lm_head, spec_step_idx)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index 1f0b5723721c6..84fb52d138545 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -27,7 +27,7 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -75,7 +75,7 @@ class ExaoneGatedMLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         prefix: str = "",
     ) -> None:
@@ -115,11 +115,11 @@ class ExaoneAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -209,11 +209,11 @@ class ExaoneBlockAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -246,8 +246,8 @@ class ExaoneDecoderLayer(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -296,7 +296,7 @@ class ExaoneDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
@@ -369,11 +369,11 @@ class ExaoneModel(nn.Module):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -536,9 +536,9 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         model_output = self.transformer(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -547,7 +547,7 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py
index 230a2c80104b1..d5e4d9a1486f7 100644
--- a/vllm/model_executor/models/exaone4.py
+++ b/vllm/model_executor/models/exaone4.py
@@ -23,7 +23,7 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -72,7 +72,7 @@ class Exaone4GatedMLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         prefix: str = "",
     ) -> None:
@@ -112,11 +112,11 @@ class Exaone4Attention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 1000000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -222,8 +222,8 @@ class Exaone4DecoderLayer(nn.Module):
     def __init__(
         self,
         config: Exaone4Config,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -277,7 +277,7 @@ class Exaone4DecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         residual = hidden_states
 
@@ -356,11 +356,11 @@ class Exaone4Model(nn.Module):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -523,9 +523,9 @@ class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         model_output = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -534,7 +534,7 @@ class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index 211a9120789e2..25429836b9ed6 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -23,7 +23,7 @@
 import math
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
+from typing import TypeAlias
 
 import torch
 from torch import nn
@@ -65,7 +65,7 @@ from .utils import (
     maybe_prefix,
 )
 
-FalconConfig = Union[HF_FalconConfig, RWConfig]
+FalconConfig: TypeAlias = HF_FalconConfig | RWConfig
 
 
 def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
@@ -95,8 +95,8 @@ class FalconAttention(nn.Module):
     def __init__(
         self,
         config: FalconConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -226,7 +226,7 @@ class FalconMLP(nn.Module):
     def __init__(
         self,
         config: FalconConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
         hidden_size = config.hidden_size
@@ -265,8 +265,8 @@ class FalconDecoderLayer(nn.Module):
     def __init__(
         self,
         config: FalconConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -401,9 +401,9 @@ class FalconModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -517,8 +517,8 @@ class FalconForCausalLM(nn.Module, SupportsPP):
         self,
         input_ids: torch.LongTensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         hidden_states = self.transformer(
             input_ids, positions, intermediate_tensors, inputs_embeds
@@ -528,7 +528,7 @@ class FalconForCausalLM(nn.Module, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py
index db938dda5d637..4e0b6b52fc647 100644
--- a/vllm/model_executor/models/falcon_h1.py
+++ b/vllm/model_executor/models/falcon_h1.py
@@ -4,7 +4,6 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional
 
 import torch
 from torch import nn
@@ -52,7 +51,7 @@ class FalconH1MLP(nn.Module):
     def __init__(
         self,
         config: FalconH1Config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
     ) -> None:
         super().__init__()
@@ -91,9 +90,9 @@ class FalconH1SSMDecoderLayer(nn.Module):
     def __init__(
         self,
         config: FalconH1Config,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -188,7 +187,7 @@ class FalconH1SSMDecoderLayer(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         output = torch.empty_like(hidden_states)
@@ -204,8 +203,8 @@ class FalconH1AttentionDecoderLayer(nn.Module):
     def __init__(
         self,
         config: FalconH1Config,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -301,7 +300,7 @@ class FalconH1AttentionDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         hidden_states = self.self_attention(
@@ -326,9 +325,9 @@ class FalconH1ParallelHybrid(nn.Module):
         self,
         config: FalconH1Config,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -467,8 +466,8 @@ class FalconH1Model(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
@@ -610,8 +609,8 @@ class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, Is
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
     ):
         hidden_states = self.model(
@@ -626,7 +625,7 @@ class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, Is
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
 
         return logits
diff --git a/vllm/model_executor/models/flex_olmo.py b/vllm/model_executor/models/flex_olmo.py
index b1fbbf086896d..11d0949a798a1 100644
--- a/vllm/model_executor/models/flex_olmo.py
+++ b/vllm/model_executor/models/flex_olmo.py
@@ -14,8 +14,6 @@
 # limitations under the License.
 """Inference-only FlexOlmo model compatible with HuggingFace weights."""
 
-from typing import Optional
-
 import torch
 from torch import nn
 
@@ -128,8 +126,8 @@ class FlexOlmoDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        residual: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         # Attention block.
         residual = hidden_states
         hidden_states = self.self_attn(positions, hidden_states)
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 83572563c15ef..005fac4b1f05d 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -20,7 +20,7 @@
 
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Literal, Optional
+from typing import Annotated, Literal
 
 import torch
 import torch.nn as nn
@@ -87,7 +87,7 @@ class FuyuProcessingInfo(BaseProcessingInfo):
     def get_image_processor(self, **kwargs: object) -> FuyuImageProcessor:
         return self.get_hf_processor(**kwargs).image_processor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": 1}
 
     def get_image_feature_grid_size(
@@ -142,7 +142,7 @@ class FuyuDummyInputsBuilder(BaseDummyInputsBuilder[FuyuProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         target_width, target_height = self.info.get_image_size_with_most_features()
         num_images = mm_counts.get("image", 0)
@@ -271,7 +271,7 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
     )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return None
 
@@ -305,7 +305,7 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[FuyuImagePatchInputs]:
+    ) -> FuyuImagePatchInputs | None:
         image_patches = kwargs.pop("image_patches", None)
         patches_per_image = kwargs.pop("patches_per_image", None)
 
@@ -344,8 +344,8 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ):
         if intermediate_tensors is not None:
@@ -362,7 +362,7 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.language_model.logits_processor(
             self.language_model.lm_head, hidden_states
         )
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index b152f52223cf6..46b111f4d9396 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -20,7 +20,6 @@
 from collections.abc import Iterable
 from functools import cache
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -59,8 +58,8 @@ logger = init_logger(__name__)
 
 @cache
 def _get_gemma_act_fn(
-    hidden_act: Optional[str],
-    hidden_activation: Optional[str],
+    hidden_act: str | None,
+    hidden_activation: str | None,
 ) -> nn.Module:
     if hidden_activation is None:
         if hidden_act is not None:
@@ -92,9 +91,9 @@ class GemmaMLP(nn.Module):
         self,
         hidden_size: int,
         intermediate_size: int,
-        hidden_act: Optional[str] = None,
-        hidden_activation: Optional[str] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        hidden_act: str | None = None,
+        hidden_activation: str | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -130,8 +129,8 @@ class GemmaAttention(nn.Module):
         head_dim: int,
         max_position_embeddings: int = 8192,
         rope_theta: float = 10000,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -207,8 +206,8 @@ class GemmaDecoderLayer(nn.Module):
     def __init__(
         self,
         config: GemmaConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -241,7 +240,7 @@ class GemmaDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
@@ -301,9 +300,9 @@ class GemmaModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -406,9 +405,9 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -417,7 +416,7 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.model.embed_tokens, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index 2d26edcf6609f..66c9b774f174d 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -18,7 +18,6 @@
 # limitations under the License.
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -66,7 +65,7 @@ class Gemma2MLP(nn.Module):
         intermediate_size: int,
         hidden_act: str,
         hidden_activation: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
@@ -100,9 +99,9 @@ class Gemma2Attention(nn.Module):
         head_dim: int,
         max_position_embeddings: int,
         rope_theta: float,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        attn_logits_soft_cap: Optional[float] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        attn_logits_soft_cap: float | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -183,8 +182,8 @@ class Gemma2DecoderLayer(nn.Module):
     def __init__(
         self,
         config: Gemma2Config,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -225,7 +224,7 @@ class Gemma2DecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if residual is None:
             residual = hidden_states
@@ -284,11 +283,11 @@ class Gemma2Model(nn.Module):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -406,9 +405,9 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -417,7 +416,7 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.model.embed_tokens, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py
index 7e6fc401757aa..80ec40f478c6d 100644
--- a/vllm/model_executor/models/gemma3.py
+++ b/vllm/model_executor/models/gemma3.py
@@ -17,7 +17,6 @@
 # limitations under the License.
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 import torch.nn.functional as F
@@ -66,7 +65,7 @@ class Gemma3MLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_activation: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -108,9 +107,9 @@ class Gemma3Attention(nn.Module):
         num_kv_heads: int,
         head_dim: int,
         max_position_embeddings: int,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        attn_logits_soft_cap: Optional[float] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        attn_logits_soft_cap: float | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -295,8 +294,8 @@ class Gemma3DecoderLayer(nn.Module):
     def __init__(
         self,
         config: Gemma3TextConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -336,7 +335,7 @@ class Gemma3DecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if residual is None:
@@ -401,12 +400,12 @@ class Gemma3Model(nn.Module):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -549,10 +548,10 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds, **kwargs
         )
@@ -561,7 +560,7 @@ class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.model.embed_tokens, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py
index 95b0b0dab5a1e..7c628fe93ce36 100644
--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Any, Literal, Optional
+from typing import Annotated, Any, Literal
 
 import torch
 from torch import nn
@@ -82,7 +82,7 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self, **kwargs: object):
         return self.ctx.get_hf_processor(Gemma3Processor, **kwargs)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def _resolve_image_kwargs(
@@ -112,7 +112,7 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[Gemma3Processor],
+        processor: Gemma3Processor | None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor()
@@ -182,7 +182,7 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[Gemma3Processor],
+        processor: Gemma3Processor | None,
     ) -> PromptUpdateDetails[str]:
         if processor is None:
             processor = self.get_hf_processor()
@@ -217,7 +217,7 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[Gemma3Processor],
+        processor: Gemma3Processor | None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor()
@@ -256,7 +256,7 @@ class Gemma3DummyInputsBuilder(BaseDummyInputsBuilder[Gemma3ProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
@@ -510,7 +510,7 @@ class Gemma3ForConditionalGeneration(
     )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<start_of_image>"
 
@@ -555,7 +555,7 @@ class Gemma3ForConditionalGeneration(
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[Gemma3ImageInputs]:
+    ) -> Gemma3ImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         num_patches = kwargs.pop("num_patches", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -609,8 +609,8 @@ class Gemma3ForConditionalGeneration(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> IntermediateTensors:
         if intermediate_tensors is not None:
@@ -692,7 +692,7 @@ class Gemma3ForConditionalGeneration(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py
index e4ea4256ebc23..f7a732e3a601c 100644
--- a/vllm/model_executor/models/gemma3n.py
+++ b/vllm/model_executor/models/gemma3n.py
@@ -16,7 +16,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from collections.abc import Iterable
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -196,7 +195,7 @@ class Gemma3nLaurelBlock(nn.Module):
         laurel_rank: int,
         rms_norm_eps: float,
         *,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str,
     ) -> None:
         super().__init__()
@@ -236,7 +235,7 @@ class Gemma3nMLP(nn.Module):
         intermediate_size: int,
         hidden_activation: str,
         activation_sparsity: float = 0.0,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -285,8 +284,8 @@ class Gemma3nAttention(nn.Module):
         num_kv_heads: int,
         head_dim: int,
         max_position_embeddings: int,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -412,8 +411,8 @@ class Gemma3nDecoderLayer(nn.Module):
     def __init__(
         self,
         config: Gemma3nTextConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -650,7 +649,7 @@ class Gemma3nSelfDecoder(nn.Module):
     def get_per_layer_inputs(
         self,
         hidden_states_0: torch.Tensor,
-        per_layer_inputs: Optional[torch.Tensor],
+        per_layer_inputs: torch.Tensor | None,
     ) -> torch.Tensor:
         per_layer_projection = self.per_layer_model_projection(hidden_states_0)
         per_layer_projection = per_layer_projection.reshape(
@@ -687,8 +686,8 @@ class Gemma3nSelfDecoder(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        per_layer_inputs: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
+        per_layer_inputs: torch.Tensor | None = None,
         **kwargs,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if inputs_embeds is not None:
@@ -870,8 +869,8 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        per_layer_inputs: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
+        per_layer_inputs: torch.Tensor | None = None,
         **kwargs,
     ) -> torch.Tensor:
         logits_indices_padded, num_logits_indices = None, None
@@ -947,8 +946,8 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        per_layer_inputs: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
+        per_layer_inputs: torch.Tensor | None = None,
         **kwargs,
     ) -> torch.Tensor:
         hidden_states, per_layer_inputs = self.self_decoder(
@@ -990,13 +989,13 @@ class Gemma3nTextModel(nn.Module, SupportsQuant):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        per_layer_inputs: Optional[torch.Tensor] = None,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        per_layer_inputs: torch.Tensor | None = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if self.fast_prefill_enabled:
             hidden_states = self.fast_prefill_forward(
                 input_ids,
@@ -1116,11 +1115,11 @@ class Gemma3nForCausalLM(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         *,
-        per_layer_inputs: Optional[torch.Tensor] = None,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        per_layer_inputs: torch.Tensor | None = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids,
             positions,
@@ -1134,7 +1133,7 @@ class Gemma3nForCausalLM(nn.Module):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.model.embed_tokens, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py
index f25f50602e6c2..d7fd2b109d24f 100644
--- a/vllm/model_executor/models/glm4.py
+++ b/vllm/model_executor/models/glm4.py
@@ -24,7 +24,6 @@
 """Inference-only GLM-4-0414 model compatible with HuggingFace weights."""
 
 from collections.abc import Iterable
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -56,12 +55,12 @@ class Glm4Attention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         max_position: int = 4096 * 32,
-        head_dim: Optional[int] = None,
+        head_dim: int | None = None,
         qkv_bias: bool = False,
         rope_theta: float = 10000,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        rope_scaling: Optional[tuple] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        rope_scaling: tuple | None = None,
         prefix: str = "",
         attn_type: str = AttentionType.DECODER,
     ) -> None:
@@ -142,7 +141,7 @@ class Glm4DecoderLayer(nn.Module):
         self,
         vllm_config: VllmConfig,
         prefix: str = "",
-        config: Optional[Glm4Config] = None,
+        config: Glm4Config | None = None,
     ) -> None:
         super().__init__()
 
@@ -189,7 +188,7 @@ class Glm4DecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
@@ -285,9 +284,9 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -296,7 +295,7 @@ class Glm4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 304e721fade5b..6e58f8c32f8ad 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -27,9 +27,9 @@
 """Inference-only GLM-4V model compatible with HuggingFace weights."""
 
 import math
-from collections.abc import Iterable, Mapping, Sequence
+from collections.abc import Callable, Iterable, Mapping, Sequence
 from functools import partial
-from typing import Annotated, Any, Callable, Literal, Optional, Union
+from typing import Annotated, Any, Literal, TypeAlias
 
 import numpy as np
 import torch
@@ -140,7 +140,7 @@ class Glm4vImageEmbeddingInputs(TensorSchema):
     image_grid_thw: Annotated[torch.Tensor, TensorShape("n", 3)]
 
 
-Glm4vImageInputs = Union[Glm4vImagePixelInputs, Glm4vImageEmbeddingInputs]
+Glm4vImageInputs: TypeAlias = Glm4vImagePixelInputs | Glm4vImageEmbeddingInputs
 
 
 class Glm4vVideoPixelInputs(TensorSchema):
@@ -176,7 +176,7 @@ class Glm4vVideoEmbeddingInputs(TensorSchema):
     video_grid_thw: Annotated[torch.Tensor, TensorShape("f", 3)]
 
 
-Glm4vVideoInputs = Union[Glm4vVideoPixelInputs, Glm4vVideoEmbeddingInputs]
+Glm4vVideoInputs: TypeAlias = Glm4vVideoPixelInputs | Glm4vVideoEmbeddingInputs
 
 # ==== Vision Encoder ==== #
 
@@ -187,7 +187,7 @@ class Glm4vVisionMLP(nn.Module):
         in_features: int,
         hidden_features: int,
         bias: bool = False,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -244,7 +244,7 @@ class Glm4vVisionAttention(nn.Module):
         embed_dim: int,
         num_heads: int,
         projection_size: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -334,8 +334,8 @@ class Glm4vVisionAttention(nn.Module):
         x: torch.Tensor,
         cu_seqlens: torch.Tensor,
         rotary_pos_emb: torch.Tensor,
-        max_seqlen: Optional[int] = None,  # Only used for Flash Attention
-        seqlens: Optional[list[int]] = None,  # Only used for xFormers
+        max_seqlen: int | None = None,  # Only used for Flash Attention
+        seqlens: list[int] | None = None,  # Only used for xFormers
     ) -> torch.Tensor:
         # [s, b, c] --> [s, b, head * 3 * head_dim]
         x, _ = self.qkv(x)
@@ -413,8 +413,8 @@ class Glm4vVisionBlock(nn.Module):
         dim: int,
         num_heads: int,
         mlp_hidden_dim: int,
-        norm_layer: Optional[Callable[[int], nn.Module]] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        norm_layer: Callable[[int], nn.Module] | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -445,8 +445,8 @@ class Glm4vVisionBlock(nn.Module):
         x: torch.Tensor,
         cu_seqlens: torch.Tensor,
         rotary_pos_emb: torch.Tensor,
-        max_seqlen: Optional[int] = None,  # Only used for Flash Attention
-        seqlens: Optional[list[int]] = None,  # Only used for xFormers
+        max_seqlen: int | None = None,  # Only used for Flash Attention
+        seqlens: list[int] | None = None,  # Only used for xFormers
     ) -> torch.Tensor:
         x_attn = self.attn(
             self.norm1(x),
@@ -495,7 +495,7 @@ class Glm4vPatchMerger(nn.Module):
         self,
         d_model: int,
         context_dim: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         prefix: str = "",
         use_data_parallel: bool = False,
@@ -693,7 +693,7 @@ class Glm4vVisionTransformer(nn.Module):
         self,
         vision_config: Glm4vVisionConfig,
         norm_eps: float = 1e-6,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -809,7 +809,7 @@ class Glm4vVisionTransformer(nn.Module):
     def compute_attn_mask_seqlen(
         self,
         cu_seqlens: torch.Tensor,
-    ) -> tuple[Optional[int], Optional[list[int]]]:
+    ) -> tuple[int | None, list[int] | None]:
         max_seqlen, seqlens = None, None
         seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
         if (
@@ -904,7 +904,7 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
     def get_tokenizer(self):
         return self.ctx.tokenizer
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None, "video": 1}
 
     def get_image_processor(self, **kwargs: object) -> Glm4vImageProcessor:
@@ -1141,7 +1141,7 @@ class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
@@ -1177,7 +1177,7 @@ class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]):
         height: int,
         num_frames: int,
         num_videos: int,
-        overrides: Optional[VideoDummyOptions] = None,
+        overrides: VideoDummyOptions | None = None,
     ) -> list[VideoItem]:
         if overrides:
             if overrides.num_frames:
@@ -1419,7 +1419,7 @@ class Glm4vForConditionalGeneration(
     supports_encoder_tp_data = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|begin_of_image|><|image|><|end_of_image|>"
         if modality.startswith("video"):
@@ -1465,7 +1465,7 @@ class Glm4vForConditionalGeneration(
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[Glm4vImageInputs]:
+    ) -> Glm4vImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
         image_grid_thw = kwargs.pop("image_grid_thw", None)
@@ -1489,7 +1489,7 @@ class Glm4vForConditionalGeneration(
 
     def _parse_and_validate_video_input(
         self, **kwargs: object
-    ) -> Optional[Glm4vVideoInputs]:
+    ) -> Glm4vVideoInputs | None:
         pixel_values_videos = kwargs.pop("pixel_values_videos", None)
         video_embeds = kwargs.pop("video_embeds", None)
         video_grid_thw = kwargs.pop("video_grid_thw", None)
@@ -1594,7 +1594,7 @@ class Glm4vForConditionalGeneration(
 
     def get_multimodal_embeddings(
         self, **kwargs: object
-    ) -> Optional[MultiModalEmbeddings]:
+    ) -> MultiModalEmbeddings | None:
         mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not mm_input_by_modality:
             return None
@@ -1619,10 +1619,10 @@ class Glm4vForConditionalGeneration(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for GLM-4V.
 
         Args:
@@ -1652,7 +1652,7 @@ class Glm4vForConditionalGeneration(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py
index b9cdee29417a6..a53f52852c6ad 100644
--- a/vllm/model_executor/models/glm4_moe.py
+++ b/vllm/model_executor/models/glm4_moe.py
@@ -26,7 +26,7 @@
 import typing
 from collections.abc import Callable, Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -81,7 +81,7 @@ class Glm4MoeMLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         reduce_results: bool = True,
         prefix: str = "",
     ) -> None:
@@ -118,7 +118,7 @@ class Glm4MoE(nn.Module):
     def __init__(
         self,
         config: Glm4MoeConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         enable_eplb: bool = False,
     ):
@@ -234,14 +234,14 @@ class Glm4MoeAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 131072,
-        head_dim: Optional[int] = None,
+        head_dim: int | None = None,
         rms_norm_eps: float = 1e-05,
         qkv_bias: bool = False,
         use_qk_norm: bool = False,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -334,8 +334,8 @@ class Glm4MoeDecoderLayer(nn.Module):
     def __init__(
         self,
         config: Glm4MoeConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         enable_eplb: bool = False,
     ) -> None:
@@ -395,7 +395,7 @@ class Glm4MoeDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if residual is None:
             residual = hidden_states
@@ -462,9 +462,9 @@ class Glm4MoeModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -704,9 +704,9 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -715,7 +715,7 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
@@ -729,7 +729,7 @@ class Glm4MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
 
 def get_spec_layer_idx_from_weight_name(
     config: Glm4MoeConfig, weight_name: str
-) -> Optional[int]:
+) -> int | None:
     if hasattr(config, "num_nextn_predict_layers") and (
         config.num_nextn_predict_layers > 0
     ):
diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py
index beb40632246c0..9fb1be7ba45c4 100644
--- a/vllm/model_executor/models/glm4_moe_mtp.py
+++ b/vllm/model_executor/models/glm4_moe_mtp.py
@@ -24,7 +24,6 @@
 """Inference-only GLM-4.5 MTP model compatible with HuggingFace weights."""
 
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -52,7 +51,7 @@ class SharedHead(nn.Module):
         self,
         config: PretrainedConfig,
         prefix: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ) -> None:
         super().__init__()
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -72,8 +71,8 @@ class Glm4MoeMultiTokenPredictorLayer(nn.Module):
         self,
         config: PretrainedConfig,
         prefix: str,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
     ) -> None:
         super().__init__()
         self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -94,7 +93,7 @@ class Glm4MoeMultiTokenPredictorLayer(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         previous_hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_index: int = 0,
     ) -> torch.Tensor:
         assert inputs_embeds is not None
@@ -149,7 +148,7 @@ class Glm4MoeMultiTokenPredictor(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         previous_hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         if inputs_embeds is None:
@@ -192,8 +191,8 @@ class Glm4MoeMTP(nn.Module, SupportsPP):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         hidden_states = self.model(
@@ -205,7 +204,7 @@ class Glm4MoeMTP(nn.Module, SupportsPP):
         self,
         hidden_states: torch.Tensor,
         spec_step_idx: int = 0,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.model.compute_logits(hidden_states, spec_step_idx)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py
index 63731b2947d2d..1bad8b0405467 100644
--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -8,7 +8,7 @@
 import itertools
 from argparse import Namespace
 from collections.abc import Mapping, Sequence
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal
 
 import torch
 from torch import nn
@@ -109,7 +109,7 @@ class EVA2CLIPAttention(nn.Module):
     def __init__(
         self,
         config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -152,7 +152,7 @@ class EVA2CLIPMLP(nn.Module):
     def __init__(
         self,
         config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -182,7 +182,7 @@ class EVA2CLIPTransformerLayer(nn.Module):
     def __init__(
         self,
         config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -211,7 +211,7 @@ class EVA2CLIPTransformer(nn.Module):
     def __init__(
         self,
         config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -237,7 +237,7 @@ class EVA2CLIPGLU(nn.Module):
         self,
         config,
         in_features,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         """
@@ -317,7 +317,7 @@ class EVA2CLIPModel(nn.Module):
     def __init__(
         self,
         config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -416,9 +416,9 @@ class GLM4VProcessor:
 
     def __call__(
         self,
-        text: Optional[Union[TextInput, list[TextInput]]] = None,
-        images: Optional[Union[ImageInput, list[ImageInput]]] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        text: TextInput | list[TextInput] | None = None,
+        images: ImageInput | list[ImageInput] | None = None,
+        return_tensors: str | TensorType | None = None,
     ) -> BatchFeature:
         if text is None:
             text = []
@@ -458,7 +458,7 @@ class GLM4VProcessingInfo(BaseProcessingInfo):
             **kwargs,
         )
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": 1}
 
     def get_num_image_tokens(self) -> int:
@@ -487,7 +487,7 @@ class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         hf_config = self.info.get_hf_config()
         vision_config = hf_config.vision_config
@@ -578,7 +578,7 @@ class GLM4VForCausalLM(
         )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|begin_of_image|><|endoftext|><|end_of_image|>"
 
@@ -601,7 +601,7 @@ class GLM4VForCausalLM(
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[GLMVImagePixelInputs]:
+    ) -> GLMVImagePixelInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
 
         if pixel_values is not None:
@@ -624,12 +624,12 @@ class GLM4VForCausalLM(
         cls,
         input_tokens: list[int],
         hf_config: PretrainedConfig,
-        image_grid_thw: Union[list[list[int]], torch.Tensor],
-        video_grid_thw: Union[list[list[int]], torch.Tensor],
+        image_grid_thw: list[list[int]] | torch.Tensor,
+        video_grid_thw: list[list[int]] | torch.Tensor,
         context_len: int = 0,
-        seq_len: Optional[int] = None,
-        second_per_grid_ts: Optional[list[float]] = None,
-        audio_feature_lengths: Optional[torch.Tensor] = None,
+        seq_len: int | None = None,
+        second_per_grid_ts: list[float] | None = None,
+        audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
         """Get mrope input positions and delta value for GLM4V."""
@@ -780,10 +780,10 @@ class GLM4VForCausalLM(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index 53d6026c5938e..4cafe724f1caa 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -22,7 +22,6 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -65,8 +64,8 @@ class GPT2Attention(nn.Module):
     def __init__(
         self,
         config: GPT2Config,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -118,7 +117,7 @@ class GPT2MLP(nn.Module):
         self,
         intermediate_size: int,
         config: GPT2Config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -150,8 +149,8 @@ class GPT2Block(nn.Module):
     def __init__(
         self,
         config: GPT2Config,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -221,9 +220,9 @@ class GPT2Model(nn.Module):
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor],
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is None:
                 inputs_embeds = self.get_input_embeddings(input_ids)
@@ -301,9 +300,9 @@ class GPT2LMHeadModel(nn.Module, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.transformer(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -312,7 +311,7 @@ class GPT2LMHeadModel(nn.Module, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
@@ -367,8 +366,8 @@ class GPT2ForSequenceClassification(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         hidden_states = self.transformer(
             input_ids=input_ids,
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index b6d3d8f3f2e60..f2c8e2aeb8225 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -23,7 +23,6 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -62,8 +61,8 @@ class GPTBigCodeAttention(nn.Module):
     def __init__(
         self,
         config: GPTBigCodeConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -133,7 +132,7 @@ class GPTBigMLP(nn.Module):
         self,
         intermediate_size: int,
         config: GPTBigCodeConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -165,8 +164,8 @@ class GPTBigCodeBlock(nn.Module):
     def __init__(
         self,
         config: GPTBigCodeConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -243,9 +242,9 @@ class GPTBigCodeModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is None:
                 inputs_embeds = self.get_input_embeddings(input_ids)
@@ -326,9 +325,9 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.transformer(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -337,7 +336,7 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index 5428512dec195..1777fd3583c39 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -21,7 +21,6 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -64,8 +63,8 @@ class GPTJAttention(nn.Module):
     def __init__(
         self,
         config: GPTJConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -130,7 +129,7 @@ class GPTJMLP(nn.Module):
         self,
         intermediate_size: int,
         config: GPTJConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
         hidden_size = config.n_embd
@@ -157,8 +156,8 @@ class GPTJBlock(nn.Module):
     def __init__(
         self,
         config: GPTJConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -218,9 +217,9 @@ class GPTJModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -322,9 +321,9 @@ class GPTJForCausalLM(nn.Module, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.transformer(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -333,7 +332,7 @@ class GPTJForCausalLM(nn.Module, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states, self.lm_head.bias)
         return logits
 
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index 8278ae03d88a5..2f638acaa2b66 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -21,7 +21,6 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -61,8 +60,8 @@ class GPTNeoXAttention(nn.Module):
     def __init__(
         self,
         config: GPTNeoXConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -125,7 +124,7 @@ class GPTNeoXMLP(nn.Module):
     def __init__(
         self,
         config: GPTNeoXConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
         self.dense_h_to_4h = ColumnParallelLinear(
@@ -151,8 +150,8 @@ class GPTNeoXLayer(nn.Module):
     def __init__(
         self,
         config: GPTNeoXConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -232,9 +231,9 @@ class GPTNeoXModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -320,9 +319,9 @@ class GPTNeoXForCausalLM(nn.Module, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.gpt_neox(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -331,7 +330,7 @@ class GPTNeoXForCausalLM(nn.Module, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.embed_out, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py
index 17f9114350798..fcba9b8e66c29 100644
--- a/vllm/model_executor/models/gpt_oss.py
+++ b/vllm/model_executor/models/gpt_oss.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.distributed as dist
@@ -49,8 +48,8 @@ class OAIAttention(nn.Module):
     def __init__(
         self,
         config: GptOssConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
+        quant_config: QuantizationConfig | None = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -208,7 +207,7 @@ class TransformerBlock(torch.nn.Module):
         self,
         hidden_states: torch.Tensor,
         positions: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
         # Self Attention
         if residual is None:
@@ -260,8 +259,8 @@ class GptOssModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
@@ -687,8 +686,8 @@ class GptOssForCausalLM(nn.Module, SupportsPP, SupportsEagle3):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return self.model(input_ids, positions, intermediate_tensors, inputs_embeds)
 
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index e9bc592c0797b..5fc8718ca75e5 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -26,7 +26,7 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -73,7 +73,7 @@ class GraniteMLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         prefix: str = "",
     ) -> None:
@@ -113,11 +113,11 @@ class GraniteAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -197,8 +197,8 @@ class GraniteDecoderLayer(nn.Module):
     def __init__(
         self,
         config: GraniteConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -323,11 +323,11 @@ class GraniteModel(nn.Module):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -480,15 +480,15 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         model_output = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
         return model_output
 
-    def compute_logits(self, hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py
index 82bceaf3ed019..043b1406bd371 100644
--- a/vllm/model_executor/models/granite_speech.py
+++ b/vllm/model_executor/models/granite_speech.py
@@ -26,7 +26,7 @@
 
 import math
 from collections.abc import Iterable, Mapping
-from typing import Annotated, Optional, Union
+from typing import Annotated
 
 import torch
 import torch.nn.functional as F
@@ -92,7 +92,7 @@ class GraniteSpeechAudioInputs(TensorSchema):
 
 
 class GraniteSpeechMultiModalProcessingInfo(BaseProcessingInfo):
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": 1}
 
     # There is no limit to the maximum number of audio tokens that can be
@@ -196,7 +196,7 @@ class GraniteSpeechDummyInputsBuilder(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_audios = mm_counts.get("audio", 0)
         audio_overrides = mm_options.get("audio") if mm_options else None
@@ -222,7 +222,7 @@ class GraniteSpeechEncoderProjector(nn.Module):
         self,
         config: PretrainedConfig,
         cache_config: CacheConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -279,7 +279,7 @@ class GraniteSpeechConformerFeedForward(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -479,7 +479,7 @@ class GraniteSpeechCTCEncoder(nn.Module):
         self,
         config: PretrainedConfig,
         prefix: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
         self.config = config
@@ -561,7 +561,7 @@ class GraniteSpeechForConditionalGeneration(
     }
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("audio"):
             return "<|audio|>"
 
@@ -606,7 +606,7 @@ class GraniteSpeechForConditionalGeneration(
     def _parse_and_validate_audio_input(
         self,
         **kwargs: object,
-    ) -> Optional[GraniteSpeechAudioInputs]:
+    ) -> GraniteSpeechAudioInputs | None:
         input_features = kwargs.pop("input_features", None)
         input_features_mask = kwargs.pop("input_features_mask", None)
         audio_embed_sizes = kwargs.pop("audio_embed_sizes", None)
@@ -763,9 +763,9 @@ class GraniteSpeechForConditionalGeneration(
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
         *,
-        is_multimodal: Optional[torch.Tensor] = None,
+        is_multimodal: torch.Tensor | None = None,
         # Multi-modal token ID may exceed vocab size
         handle_oov_mm_token: bool = True,
     ) -> torch.Tensor:
@@ -784,10 +784,10 @@ class GraniteSpeechForConditionalGeneration(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
@@ -799,7 +799,7 @@ class GraniteSpeechForConditionalGeneration(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(
diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py
index 4711ed05c5879..e683f30805f37 100644
--- a/vllm/model_executor/models/granitemoe.py
+++ b/vllm/model_executor/models/granitemoe.py
@@ -26,7 +26,7 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional
+from typing import Any
 
 import torch
 from torch import nn
@@ -79,9 +79,9 @@ class GraniteMoeMoE(nn.Module):
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        tp_size: Optional[int] = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
+        tp_size: int | None = None,
         is_sequence_parallel=False,
         prefix: str = "",
     ):
@@ -143,10 +143,10 @@ class GraniteMoeAttention(nn.Module):
         num_kv_heads: int,
         max_position: int = 4096 * 32,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        attention_multiplier: Optional[float] = None,
+        rope_scaling: dict[str, Any] | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        attention_multiplier: float | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -330,8 +330,8 @@ class GraniteMoeModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
@@ -557,15 +557,15 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
         return hidden_states
 
-    def compute_logits(self, hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index f877dc5764275..14d3a46e54af5 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -4,7 +4,6 @@
 
 # Added by the IBM Team, 2025
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 from torch import nn
@@ -50,9 +49,9 @@ class GraniteMoeHybridMambaDecoderLayer(nn.Module):
         self,
         config: GraniteMoeHybridConfig,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -105,7 +104,7 @@ class GraniteMoeHybridMambaDecoderLayer(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         residual = hidden_states
@@ -139,9 +138,9 @@ class GraniteMoeHybridAttentionDecoderLayer(nn.Module):
         self,
         config: GraniteMoeHybridConfig,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -183,7 +182,7 @@ class GraniteMoeHybridAttentionDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
         residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states)
@@ -218,9 +217,9 @@ class GraniteMoeHybridAttention(nn.Module):
     def __init__(
         self,
         config: GraniteMoeHybridConfig,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -374,8 +373,8 @@ class GraniteMoeHybridModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
@@ -614,8 +613,8 @@ class GraniteMoeHybridForCausalLM(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
     ):
         hidden_states = self.model(
@@ -627,7 +626,7 @@ class GraniteMoeHybridForCausalLM(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py
index 93302821ca68d..e222109f2a949 100644
--- a/vllm/model_executor/models/granitemoeshared.py
+++ b/vllm/model_executor/models/granitemoeshared.py
@@ -8,7 +8,6 @@ experts.
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional
 
 import torch
 from torch import nn
@@ -41,7 +40,7 @@ class GraniteMoeSharedMLP(nn.Module):
     def __init__(
         self,
         config: GraniteMoeSharedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -80,8 +79,8 @@ class GraniteMoeSharedDecoderLayer(nn.Module):
     def __init__(
         self,
         config: GraniteMoeSharedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -198,8 +197,8 @@ class GraniteMoeSharedModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
@@ -321,15 +320,15 @@ class GraniteMoeSharedForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
         return hidden_states
 
-    def compute_logits(self, hidden_states: torch.Tensor) -> Optional[torch.Tensor]:
+    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py
index ac78dd9e753aa..756a3900965b0 100644
--- a/vllm/model_executor/models/gritlm.py
+++ b/vllm/model_executor/models/gritlm.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Set
-from typing import Optional, Union
 
 import numpy as np
 import torch
@@ -62,7 +61,7 @@ class GritLMMeanPool(nn.Module):
         arr: np.ndarray,
         target: np.ndarray,
         start_idx: int = 0,
-        end_idx: Optional[int] = None,
+        end_idx: int | None = None,
     ) -> int:
         """
         Find the first occurrence of `target` in `arr` starting from
@@ -152,8 +151,8 @@ class GritLMMeanPool(nn.Module):
     def forward_one(
         self,
         hidden_states: torch.Tensor,
-        prompt_len: Optional[torch.Tensor] = None,
-        instr_len: Optional[torch.Tensor] = None,
+        prompt_len: torch.Tensor | None = None,
+        instr_len: torch.Tensor | None = None,
     ) -> torch.Tensor:
         assert prompt_len is None or prompt_len == hidden_states.shape[0], (
             "partial prefill not supported with MEAN pooling"
@@ -166,7 +165,7 @@ class GritLMMeanPool(nn.Module):
         hidden_states: torch.Tensor,
         prompt_lens: torch.Tensor,
         instr_lens: torch.Tensor,
-    ) -> Union[list[torch.Tensor], torch.Tensor]:
+    ) -> list[torch.Tensor] | torch.Tensor:
         offset = 0
         pooled_data = list[torch.Tensor]()
 
@@ -182,9 +181,9 @@ class GritLMMeanPool(nn.Module):
 
     def forward(
         self,
-        hidden_states: Union[torch.Tensor, list[torch.Tensor]],
+        hidden_states: torch.Tensor | list[torch.Tensor],
         pooling_metadata: PoolingMetadata,
-    ) -> Union[list[torch.Tensor], torch.Tensor]:
+    ) -> list[torch.Tensor] | torch.Tensor:
         prompt_lens = get_prompt_lens(hidden_states, pooling_metadata)
         instr_lens = torch.tensor(
             [
diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py
index f4139685b79f6..d77a0bc2993a0 100644
--- a/vllm/model_executor/models/grok1.py
+++ b/vllm/model_executor/models/grok1.py
@@ -25,7 +25,6 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 import torch.nn.functional as F
@@ -86,9 +85,9 @@ class Grok1MoE(nn.Module):
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        tp_size: Optional[int] = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
+        tp_size: int | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -137,8 +136,8 @@ class Grok1Attention(nn.Module):
         num_kv_heads: int,
         max_position: int = 4096 * 32,
         rope_theta: float = 10000,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         config=None,  # Added config parameter
     ) -> None:
@@ -223,8 +222,8 @@ class Grok1DecoderLayer(nn.Module):
     def __init__(
         self,
         config,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -273,7 +272,7 @@ class Grok1DecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
@@ -351,9 +350,9 @@ class Grok1Model(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -544,9 +543,9 @@ class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -555,7 +554,7 @@ class Grok1ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py
index d7ee0fd8fd37c..81c6b34bd6ce0 100644
--- a/vllm/model_executor/models/h2ovl.py
+++ b/vllm/model_executor/models/h2ovl.py
@@ -9,7 +9,6 @@
 # Licensed under Apache 2.0 License [see LICENSE for details]
 # --------------------------------------------------------
 from collections.abc import Mapping, Sequence
-from typing import Optional, Union
 
 import torch
 from PIL import Image
@@ -67,7 +66,7 @@ def get_h2ovl_target_ratios(
     min_num: int,
     max_num: int,
     *,
-    prior_aspect_ratio: Optional[tuple[int, int]],
+    prior_aspect_ratio: tuple[int, int] | None,
 ) -> list[tuple[int, int]]:
     target_ratios = get_internvl_target_ratios(min_num, max_num)
 
@@ -170,7 +169,7 @@ def _preprocess_image(
     min_num: int,
     max_num: int,
     use_thumbnail: bool,
-    prior_aspect_ratio: Optional[tuple[int, int]],
+    prior_aspect_ratio: tuple[int, int] | None,
 ) -> tuple[torch.Tensor, tuple[int, int]]:
     target_ratios = get_h2ovl_target_ratios(
         min_num,
@@ -244,10 +243,10 @@ class H2OVLProcessor(BaseInternVLProcessor):
         config: PretrainedConfig,
         tokenizer: AnyTokenizer,
         *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        use_msac: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        use_msac: bool | None = None,
     ) -> None:
         super().__init__(
             config,
@@ -270,7 +269,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
     def get_image_repl(
         self,
         feature_size: int,
-        num_patches: Optional[int],
+        num_patches: int | None,
     ) -> PromptUpdateDetails[str]:
         repl_features = IMG_CONTEXT * feature_size
         repl_full = IMG_START + repl_features + IMG_END
@@ -280,10 +279,10 @@ class H2OVLProcessor(BaseInternVLProcessor):
     def resolve_min_max_num(
         self,
         *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        use_thumbnail: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        use_thumbnail: bool | None = None,
     ) -> tuple[int, int]:
         min_dynamic_patch = (
             self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
@@ -308,12 +307,12 @@ class H2OVLProcessor(BaseInternVLProcessor):
     def resolve_target_ratios(
         self,
         *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        use_thumbnail: Optional[bool] = None,
-        prior_aspect_ratio: Optional[tuple[int, int]] = None,
-        override_min_num: Optional[int] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        use_thumbnail: bool | None = None,
+        prior_aspect_ratio: tuple[int, int] | None = None,
+        override_min_num: int | None = None,
     ) -> list[tuple[int, int]]:
         min_num, max_num = self.resolve_min_max_num(
             min_dynamic_patch=min_dynamic_patch,
@@ -335,7 +334,7 @@ class H2OVLProcessor(BaseInternVLProcessor):
         *,
         image_width: int,
         image_height: int,
-        use_msac: Optional[bool] = None,
+        use_msac: bool | None = None,
     ) -> int:
         use_msac = self.use_msac if use_msac is None else use_msac
 
@@ -385,9 +384,9 @@ class H2OVLProcessor(BaseInternVLProcessor):
     def _images_to_pixel_values_lst(
         self,
         images: list[Image.Image],
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
     ) -> list[torch.Tensor]:
         use_msac = self.use_msac if len(images) == 1 else False
 
@@ -425,8 +424,8 @@ class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[H2OVLProcessor],
-        use_msac: Optional[bool] = None,
+        processor: H2OVLProcessor | None,
+        use_msac: bool | None = None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor()
@@ -493,11 +492,11 @@ class H2OVLMultiModalProcessor(BaseInternVLMultiModalProcessor[H2OVLProcessingIn
 
     def _cached_apply_hf_processor(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         # The processor logic is different for len(images) <= 1 vs > 1
         # Since the processing cache assumes that the processor output is
@@ -530,7 +529,7 @@ class H2OVLChatModel(InternVLChatModel):
     def _init_vision_model(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         *,
         is_mono: bool,
         prefix: str,
diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py
index cf2e5d0d0bd6e..901f29310872b 100644
--- a/vllm/model_executor/models/hunyuan_v1.py
+++ b/vllm/model_executor/models/hunyuan_v1.py
@@ -27,7 +27,7 @@
 import typing
 from collections.abc import Callable, Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import regex as re
 import torch
@@ -102,7 +102,7 @@ class HunYuanMLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         prefix: str = "",
         reduce_results: bool = True,
@@ -144,11 +144,11 @@ class HunYuanAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
         layer_id: int = -1,
     ) -> None:
@@ -227,7 +227,7 @@ class HunYuanAttention(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        kv_states: Optional[tuple[torch.Tensor]] = None,
+        kv_states: tuple[torch.Tensor] | None = None,
     ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
@@ -256,11 +256,11 @@ class HunYuanCrossAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
         layer_id: int = -1,
     ) -> None:
@@ -338,7 +338,7 @@ class HunYuanCrossAttention(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        kv_states: Optional[tuple[torch.Tensor]] = None,
+        kv_states: tuple[torch.Tensor] | None = None,
     ) -> torch.Tensor:
         assert kv_states is not None
         ori_k, v = kv_states  # use last layer kv,
@@ -365,7 +365,7 @@ class HunYuanSparseMoeBlock(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         layer_id: int = -1,
         prefix: str = "",
         enable_eplb: bool = False,
@@ -480,8 +480,8 @@ class HunYuanDecoderLayer(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         layer_id: int = -1,
         enable_eplb: bool = False,
@@ -577,8 +577,8 @@ class HunYuanDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
-        kv_states: Optional[tuple[torch.Tensor]] = None,
+        residual: torch.Tensor | None,
+        kv_states: tuple[torch.Tensor] | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
@@ -654,11 +654,11 @@ class HunYuanModel(nn.Module):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -960,9 +960,9 @@ class HunyuanV1ModelBase(nn.Module, SupportsLoRA, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         model_output = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -971,7 +971,7 @@ class HunyuanV1ModelBase(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py
index 611c14733c71f..ad39443f93daa 100644
--- a/vllm/model_executor/models/hyperclovax_vision.py
+++ b/vllm/model_executor/models/hyperclovax_vision.py
@@ -6,7 +6,7 @@ from collections import defaultdict
 from collections.abc import Iterable, Mapping, Sequence
 from functools import partial
 from itertools import accumulate
-from typing import Annotated, Any, Literal, Optional, Union
+from typing import Annotated, Any, Literal
 
 import numpy as np
 import torch
@@ -115,13 +115,13 @@ class HCXVisionProcessingInfo(BaseProcessingInfo):
     def get_vision_encoder_info(self):
         return get_vision_encoder_info(self.get_hf_config())
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None, "video": None}
 
     def get_num_image_tokens(
         self,
         *,
-        vision_query_length: Union[int, list[int]],
+        vision_query_length: int | list[int],
     ) -> int:
         if isinstance(vision_query_length, int):
             return vision_query_length
@@ -131,7 +131,7 @@ class HCXVisionProcessingInfo(BaseProcessingInfo):
     def get_num_video_tokens(
         self,
         *,
-        vision_query_length: Union[int, list[int]],
+        vision_query_length: int | list[int],
     ) -> int:
         if isinstance(vision_query_length, int):
             return vision_query_length
@@ -166,7 +166,7 @@ class HCXVisionDummyInputsBuilder(BaseDummyInputsBuilder[HCXVisionProcessingInfo
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
@@ -346,7 +346,7 @@ def _build_hcxvision_hf_processor(
     info: HCXVisionProcessingInfo,
     dummy_inputs: BaseDummyInputsBuilder[HCXVisionProcessingInfo],
     *,
-    cache: Optional[BaseMultiModalProcessorCache] = None,
+    cache: BaseMultiModalProcessorCache | None = None,
 ) -> BaseMultiModalProcessor:
     if isinstance(info, HCXVisionProcessingInfo):
         return HCXVisionMultiModalProcessor(
@@ -360,12 +360,12 @@ def _build_hcxvision_hf_processor(
 
 def init_vision_tower_for_hcxvision(
     vision_config,
-    quant_config: Optional[QuantizationConfig],
+    quant_config: QuantizationConfig | None,
     *,
-    use_nth_layer: Optional[int] = None,
-    require_post_norm: Optional[bool] = None,
+    use_nth_layer: int | None = None,
+    require_post_norm: bool | None = None,
     prefix: str = "",
-) -> Union[CLIPVisionModel, SiglipVisionModel]:
+) -> CLIPVisionModel | SiglipVisionModel:
     num_hidden_layers = vision_config.num_hidden_layers
     if not isinstance(use_nth_layer, int):
         pass
@@ -473,8 +473,8 @@ class HCXVisionCAbstractor(nn.Module):
     def forward(
         self,
         x: torch.Tensor,
-        num_queries_vis_abstractors: Optional[list[list[int]]] = None,
-        num_grids: Optional[list[int]] = None,
+        num_queries_vis_abstractors: list[list[int]] | None = None,
+        num_grids: list[int] | None = None,
     ) -> torch.Tensor:
         if self.prenorm is not None:
             x = self.prenorm(x)
@@ -493,8 +493,8 @@ class HCXVisionCAbstractor(nn.Module):
     def _forward(
         self,
         x: torch.Tensor,
-        num_queries_vis_abstractors: Optional[list[list[int]]] = None,
-        num_grids: Optional[list[int]] = None,
+        num_queries_vis_abstractors: list[list[int]] | None = None,
+        num_grids: list[int] | None = None,
     ) -> torch.Tensor:
         # x: [B, L, dim]
         B, L, dim = x.shape
@@ -515,8 +515,8 @@ class HCXVisionCAbstractor(nn.Module):
     def _forward_adaptive_num_query(
         self,
         x: torch.Tensor,
-        num_queries_vis_abstractors: Optional[list[list[int]]] = None,
-        num_grids: Optional[list[int]] = None,
+        num_queries_vis_abstractors: list[list[int]] | None = None,
+        num_grids: list[int] | None = None,
     ) -> list[torch.Tensor]:
         # self.net is consisted by 3 layers (s1, sampler, s2)
         assert len(self.net) == 3
@@ -604,7 +604,7 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         *,
         vllm_config: VllmConfig,
         prefix: str = "",
-        **kwargs: Optional[Any],
+        **kwargs: Any | None,
     ) -> None:
         super().__init__()
 
@@ -662,7 +662,7 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         # self.reduction = self._init_reduction_type(use_sum_loss)
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return IMAGE_TOKEN
         if modality.startswith("video"):
@@ -673,7 +673,7 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
     def _parse_and_validate_image_input(
         self,
         **kwargs: object,
-    ) -> Optional[HCXVisionImageInputs]:
+    ) -> HCXVisionImageInputs | None:
         pixel_values_images = kwargs.pop("pixel_values_images", None)
 
         if pixel_values_images is None:
@@ -689,7 +689,7 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
     def _parse_and_validate_video_input(
         self,
         **kwargs: object,
-    ) -> Optional[HCXVisionVideoInputs]:
+    ) -> HCXVisionVideoInputs | None:
         pixel_values_videos = kwargs.pop("pixel_values_videos", None)
 
         if pixel_values_videos is None:
@@ -762,10 +762,10 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
@@ -946,7 +946,7 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(
@@ -1062,7 +1062,7 @@ def select_best_resolution(original_size: tuple, possible_resolutions: list) ->
 
 def get_anyres_image_grid_shape(
     image_size: tuple[int, int],
-    grid_pinpoints: Union[str, list[tuple[int, int]]],
+    grid_pinpoints: str | list[tuple[int, int]],
     patch_size: int,
 ) -> tuple[int, int]:
     possible_resolutions = (
diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py
index 02c46a11a1798..727c8ec0397ca 100644
--- a/vllm/model_executor/models/idefics2_vision_model.py
+++ b/vllm/model_executor/models/idefics2_vision_model.py
@@ -19,7 +19,6 @@
 """PyTorch Idefics2 model."""
 
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 from torch import nn
@@ -77,7 +76,7 @@ class Idefics2VisionEmbeddings(nn.Module):
         self,
         pixel_values: torch.FloatTensor,
         patch_attention_mask: torch.BoolTensor,
-        tgt_sizes: Optional[torch.IntTensor] = None,
+        tgt_sizes: torch.IntTensor | None = None,
     ) -> torch.Tensor:
         batch_size, _, max_im_h, max_im_w = pixel_values.shape
         target_dtype = self.patch_embedding.weight.dtype
@@ -124,7 +123,7 @@ class Idefics2VisionAttention(nn.Module):
     def __init__(
         self,
         config: Idefics2VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -185,7 +184,7 @@ class Idefics2VisionMLP(nn.Module):
     def __init__(
         self,
         config: Idefics2VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -220,7 +219,7 @@ class Idefics2EncoderLayer(nn.Module):
     def __init__(
         self,
         config: Idefics2Config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -275,9 +274,9 @@ class Idefics2Encoder(nn.Module):
     def __init__(
         self,
         config: Idefics2Config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
+        num_hidden_layers_override: int | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -326,9 +325,9 @@ class Idefics2VisionTransformer(nn.Module):
     def __init__(
         self,
         config: Idefics2VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
+        num_hidden_layers_override: int | None = None,
         require_post_norm: bool = True,
         prefix: str = "",
         use_data_parallel: bool = False,
@@ -370,8 +369,8 @@ class Idefics2VisionTransformer(nn.Module):
     def forward(
         self,
         pixel_values,
-        patch_attention_mask: Optional[torch.BoolTensor] = None,
-        tgt_sizes: Optional[torch.IntTensor] = None,
+        patch_attention_mask: torch.BoolTensor | None = None,
+        tgt_sizes: torch.IntTensor | None = None,
     ) -> torch.Tensor:
         hidden_states = self.embeddings(
             pixel_values=pixel_values,
diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py
index effdbdc1ac384..06ca8c4886341 100644
--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -18,7 +18,7 @@
 
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal, TypeAlias
 
 import torch
 from torch import nn
@@ -91,14 +91,14 @@ class Idefics3ImageEmbeddingInputs(TensorSchema):
     data: Annotated[torch.Tensor, TensorShape("bn", "f", "h")]
 
 
-ImageInputs = Union[Idefics3ImagePixelInputs, Idefics3ImageEmbeddingInputs]
+ImageInputs: TypeAlias = Idefics3ImagePixelInputs | Idefics3ImageEmbeddingInputs
 
 
 class Idefics3ProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self, **kwargs: object) -> Idefics3Processor:
         return self.ctx.get_hf_processor(Idefics3Processor, **kwargs)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def _resize_output_size(
@@ -106,9 +106,9 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
         *,
         height: int,
         width: int,
-        max_len: Optional[int] = None,
+        max_len: int | None = None,
         min_len: int = 1,
-        max_size: Optional[int] = None,
+        max_size: int | None = None,
     ) -> tuple[int, int]:
         # Set default value for max_len if not provided
         max_len = max(height, width) if max_len is None else max_len
@@ -165,7 +165,7 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[Idefics3Processor],
+        processor: Idefics3Processor | None,
     ) -> tuple[int, int]:
         if processor is None:
             processor = self.get_hf_processor()
@@ -197,7 +197,7 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[Idefics3Processor],
+        processor: Idefics3Processor | None,
     ) -> int:
         grid_w, grid_h = self._get_image_feature_grid_size(
             image_width=image_width,
@@ -208,7 +208,7 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
         return grid_w * grid_h + 1
 
     def _get_image_token(
-        self, processor: Optional[Idefics3Processor]
+        self, processor: Idefics3Processor | None
     ) -> tuple[str, str, str]:
         if processor is None:
             processor = self.get_hf_processor()
@@ -223,7 +223,7 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[Idefics3Processor],
+        processor: Idefics3Processor | None,
     ) -> str:
         if processor is None:
             processor = self.get_hf_processor()
@@ -269,7 +269,7 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[Idefics3Processor],
+        processor: Idefics3Processor | None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor()
@@ -305,7 +305,7 @@ class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo])
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         hf_processor = self.info.get_hf_processor()
@@ -425,7 +425,7 @@ class Idefics3SimpleMLP(nn.Module):
     def __init__(
         self,
         config: Idefics3Config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -448,7 +448,7 @@ class Idefics3Connector(nn.Module):
     def __init__(
         self,
         config: Idefics3Config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -557,9 +557,9 @@ class Idefics3Model(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.text_model(
             input_ids,
             positions,
@@ -590,7 +590,7 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLo
     }
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
 
@@ -621,9 +621,7 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLo
             self.lm_head.weight = self.model.text_model.embed_tokens.weight
         self.logits_processor = LogitsProcessor(config.text_config.vocab_size)
 
-    def _parse_and_validate_image_input(
-        self, **kwargs: object
-    ) -> Optional[ImageInputs]:
+    def _parse_and_validate_image_input(self, **kwargs: object) -> ImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
 
@@ -663,7 +661,7 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLo
     def _process_image_input(
         self,
         image_input: ImageInputs,
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+    ) -> torch.Tensor | list[torch.Tensor]:
         if image_input["type"] == "image_embeds":
             return image_input["data"]
 
@@ -687,10 +685,10 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLo
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 68915d60ef480..5137cc261cc45 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -1,15 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from collections.abc import Iterable, Mapping, MutableSequence
+from collections.abc import Callable, Iterable, Mapping, MutableSequence
 from typing import (
     TYPE_CHECKING,
-    Callable,
     ClassVar,
     Literal,
-    Optional,
     Protocol,
-    Union,
+    TypeAlias,
     overload,
     runtime_checkable,
 )
@@ -34,10 +32,14 @@ if TYPE_CHECKING:
     from vllm.config import VllmConfig
     from vllm.model_executor.models.utils import WeightsMapper
     from vllm.sequence import IntermediateTensors
+else:
+    VllmConfig = object
+    WeightsMapper = object
+    IntermediateTensors = object
 
 logger = init_logger(__name__)
 
-MultiModalEmbeddings = Union[list[Tensor], Tensor, tuple[Tensor, ...]]
+MultiModalEmbeddings: TypeAlias = list[Tensor] | Tensor | tuple[Tensor, ...]
 """
 The output embeddings must be one of the following formats:
 
@@ -79,7 +81,7 @@ class SupportsMultiModal(Protocol):
     """
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         """
         Get the placeholder text for the `i`th `modality` item in the prompt.
         """
@@ -127,7 +129,7 @@ class SupportsMultiModal(Protocol):
         input_ids: Tensor,
         get_input_embeddings: Callable[[Tensor], Tensor],
         *,
-        is_multimodal: Optional[Tensor],
+        is_multimodal: Tensor | None,
         handle_oov_mm_token: bool,
     ) -> Tensor:
         if handle_oov_mm_token and is_multimodal is not None:
@@ -145,9 +147,9 @@ class SupportsMultiModal(Protocol):
     def get_input_embeddings(
         self,
         input_ids: Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
         *,
-        is_multimodal: Optional[Tensor] = None,
+        is_multimodal: Tensor | None = None,
         handle_oov_mm_token: bool = False,
     ) -> Tensor:
         """
@@ -236,16 +238,16 @@ def supports_multimodal(model: object) -> TypeIs[SupportsMultiModal]: ...
 
 
 def supports_multimodal(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[SupportsMultiModal]], TypeIs[SupportsMultiModal]]:
+    model: type[object] | object,
+) -> TypeIs[type[SupportsMultiModal]] | TypeIs[SupportsMultiModal]:
     return getattr(model, "supports_multimodal", False)
 
 
-def supports_multimodal_raw_input_only(model: Union[type[object], object]) -> bool:
+def supports_multimodal_raw_input_only(model: type[object] | object) -> bool:
     return getattr(model, "supports_multimodal_raw_input_only", False)
 
 
-def supports_multimodal_encoder_tp_data(model: Union[type[object], object]) -> bool:
+def supports_multimodal_encoder_tp_data(model: type[object] | object) -> bool:
     return getattr(model, "supports_encoder_tp_data", False)
 
 
@@ -260,8 +262,8 @@ def supports_multimodal_pruning(model: object) -> TypeIs[SupportsMultiModalPruni
 
 
 def supports_multimodal_pruning(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[SupportsMultiModalPruning]], TypeIs[SupportsMultiModalPruning]]:
+    model: type[object] | object,
+) -> TypeIs[type[SupportsMultiModalPruning]] | TypeIs[SupportsMultiModalPruning]:
     return getattr(model, "supports_multimodal_pruning", False)
 
 
@@ -279,7 +281,7 @@ class SupportsScoreTemplate(Protocol):
     """
 
     @classmethod
-    def get_score_template(cls, query: str, document: str) -> Optional[str]:
+    def get_score_template(cls, query: str, document: str) -> str | None:
         """
         Generate a full prompt by populating the score template with query and document content.
         """  # noqa: E501
@@ -304,8 +306,8 @@ def supports_score_template(model: object) -> TypeIs[SupportsScoreTemplate]: ...
 
 
 def supports_score_template(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[SupportsScoreTemplate]], TypeIs[SupportsScoreTemplate]]:
+    model: type[object] | object,
+) -> TypeIs[type[SupportsScoreTemplate]] | TypeIs[SupportsScoreTemplate]:
     return getattr(model, "supports_score_template", False)
 
 
@@ -348,8 +350,8 @@ def supports_lora(model: object) -> TypeIs[SupportsLoRA]: ...
 
 
 def supports_lora(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[SupportsLoRA]], TypeIs[SupportsLoRA]]:
+    model: type[object] | object,
+) -> TypeIs[type[SupportsLoRA]] | TypeIs[SupportsLoRA]:
     result = _supports_lora(model)
 
     if not result:
@@ -379,7 +381,7 @@ def supports_lora(
     return result
 
 
-def _supports_lora(model: Union[type[object], object]) -> bool:
+def _supports_lora(model: type[object] | object) -> bool:
     if isinstance(model, type):
         return isinstance(model, _SupportsLoRAType)
 
@@ -404,15 +406,15 @@ class SupportsPP(Protocol):
         batch_size: int,
         dtype: torch.dtype,
         device: torch.device,
-    ) -> "IntermediateTensors":
+    ) -> IntermediateTensors:
         """Called when PP rank > 0 for profiling purposes."""
         ...
 
     def forward(
         self,
         *,
-        intermediate_tensors: Optional["IntermediateTensors"],
-    ) -> Union[Tensor, "IntermediateTensors"]:
+        intermediate_tensors: IntermediateTensors | None,
+    ) -> Tensor | IntermediateTensors:
         """
         Accept [`IntermediateTensors`][vllm.sequence.IntermediateTensors] when
         PP rank > 0.
@@ -434,13 +436,13 @@ class _SupportsPPType(Protocol):
         batch_size: int,
         dtype: torch.dtype,
         device: torch.device,
-    ) -> "IntermediateTensors": ...
+    ) -> IntermediateTensors: ...
 
     def forward(
         self,
         *,
-        intermediate_tensors: Optional["IntermediateTensors"],
-    ) -> Union[Tensor, "IntermediateTensors"]: ...
+        intermediate_tensors: IntermediateTensors | None,
+    ) -> Tensor | IntermediateTensors: ...
 
 
 @overload
@@ -452,8 +454,8 @@ def supports_pp(model: object) -> TypeIs[SupportsPP]: ...
 
 
 def supports_pp(
-    model: Union[type[object], object],
-) -> Union[bool, TypeIs[type[SupportsPP]], TypeIs[SupportsPP]]:
+    model: type[object] | object,
+) -> bool | TypeIs[type[SupportsPP]] | TypeIs[SupportsPP]:
     supports_attributes = _supports_pp_attributes(model)
     supports_inspect = _supports_pp_inspect(model)
 
@@ -487,14 +489,14 @@ def supports_pp(
     return supports_attributes and supports_inspect
 
 
-def _supports_pp_attributes(model: Union[type[object], object]) -> bool:
+def _supports_pp_attributes(model: type[object] | object) -> bool:
     if isinstance(model, type):
         return isinstance(model, _SupportsPPType)
 
     return isinstance(model, SupportsPP)
 
 
-def _supports_pp_inspect(model: Union[type[object], object]) -> bool:
+def _supports_pp_inspect(model: type[object] | object) -> bool:
     model_forward = getattr(model, "forward", None)
     if not callable(model_forward):
         return False
@@ -523,8 +525,8 @@ def has_inner_state(model: type[object]) -> TypeIs[type[HasInnerState]]: ...
 
 
 def has_inner_state(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[HasInnerState]], TypeIs[HasInnerState]]:
+    model: type[object] | object,
+) -> TypeIs[type[HasInnerState]] | TypeIs[HasInnerState]:
     return getattr(model, "has_inner_state", False)
 
 
@@ -550,8 +552,8 @@ def is_attention_free(model: type[object]) -> TypeIs[type[IsAttentionFree]]: ...
 
 
 def is_attention_free(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[IsAttentionFree]], TypeIs[IsAttentionFree]]:
+    model: type[object] | object,
+) -> TypeIs[type[IsAttentionFree]] | TypeIs[IsAttentionFree]:
     return getattr(model, "is_attention_free", False)
 
 
@@ -570,7 +572,7 @@ class IsHybrid(Protocol):
     @classmethod
     def get_mamba_state_shape_from_config(
         cls,
-        vllm_config: "VllmConfig",
+        vllm_config: VllmConfig,
         use_v1: bool = True,
     ) -> tuple[tuple[int, int], tuple[int, int, int]]:
         """Calculate shapes for Mamba's convolutional and state caches.
@@ -596,8 +598,8 @@ def is_hybrid(model: type[object]) -> TypeIs[type[IsHybrid]]: ...
 
 
 def is_hybrid(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[IsHybrid]], TypeIs[IsHybrid]]:
+    model: type[object] | object,
+) -> TypeIs[type[IsHybrid]] | TypeIs[IsHybrid]:
     return getattr(model, "is_hybrid", False)
 
 
@@ -688,8 +690,8 @@ def has_noops(model: type[object]) -> TypeIs[type[HasNoOps]]: ...
 
 
 def has_noops(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[HasNoOps]], TypeIs[HasNoOps]]:
+    model: type[object] | object,
+) -> TypeIs[type[HasNoOps]] | TypeIs[HasNoOps]:
     return getattr(model, "has_noops", False)
 
 
@@ -711,23 +713,23 @@ def supports_cross_encoding(model: object) -> TypeIs[SupportsCrossEncoding]: ...
 
 
 def _supports_cross_encoding(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]:
+    model: type[object] | object,
+) -> TypeIs[type[SupportsCrossEncoding]] | TypeIs[SupportsCrossEncoding]:
     return getattr(model, "supports_cross_encoding", False)
 
 
 def supports_cross_encoding(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]:
+    model: type[object] | object,
+) -> TypeIs[type[SupportsCrossEncoding]] | TypeIs[SupportsCrossEncoding]:
     return is_pooling_model(model) and _supports_cross_encoding(model)
 
 
 class SupportsQuant:
     """The interface required for all models that support quantization."""
 
-    hf_to_vllm_mapper: ClassVar[Optional["WeightsMapper"]] = None
-    packed_modules_mapping: ClassVar[Optional[dict[str, list[str]]]] = None
-    quant_config: Optional[QuantizationConfig] = None
+    hf_to_vllm_mapper: ClassVar[WeightsMapper | None] = None
+    packed_modules_mapping: ClassVar[dict[str, list[str]] | None] = None
+    quant_config: QuantizationConfig | None = None
 
     def __new__(cls, *args, **kwargs) -> Self:
         instance = super().__new__(cls)
@@ -749,7 +751,7 @@ class SupportsQuant:
         return instance
 
     @staticmethod
-    def _find_quant_config(*args, **kwargs) -> Optional[QuantizationConfig]:
+    def _find_quant_config(*args, **kwargs) -> QuantizationConfig | None:
         """Find quant config passed through model constructor args"""
         from vllm.config import VllmConfig  # avoid circular import
 
@@ -797,10 +799,10 @@ class SupportsTranscription(Protocol):
         audio: np.ndarray,
         stt_config: SpeechToTextConfig,
         model_config: ModelConfig,
-        language: Optional[str],
+        language: str | None,
         task_type: Literal["transcribe", "translate"],
         request_prompt: str,
-        to_language: Optional[str],
+        to_language: str | None,
     ) -> PromptType:
         """Get the prompt for the ASR model.
         The model has control over the construction, as long as it
@@ -813,7 +815,7 @@ class SupportsTranscription(Protocol):
         return {k: v for k, v in LANGUAGES.items() if k not in cls.supported_languages}
 
     @classmethod
-    def validate_language(cls, language: Optional[str]) -> Optional[str]:
+    def validate_language(cls, language: str | None) -> str | None:
         """
         Ensure the language specified in the transcription request
         is a valid ISO 639-1 language code. If the request language is
@@ -850,7 +852,7 @@ class SupportsTranscription(Protocol):
         audio_duration_s: float,
         stt_config: SpeechToTextConfig,
         model_config: ModelConfig,
-    ) -> Optional[int]:
+    ) -> int | None:
         """
         Map from audio duration to number of audio tokens produced by the ASR
         model, without running a forward pass.
@@ -870,8 +872,8 @@ def supports_transcription(model: object) -> TypeIs[SupportsTranscription]: ...
 
 
 def supports_transcription(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[SupportsTranscription]], TypeIs[SupportsTranscription]]:
+    model: type[object] | object,
+) -> TypeIs[type[SupportsTranscription]] | TypeIs[SupportsTranscription]:
     return getattr(model, "supports_transcription", False)
 
 
@@ -891,8 +893,8 @@ def supports_v0_only(model: object) -> TypeIs[SupportsV0Only]: ...
 
 
 def supports_v0_only(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[SupportsV0Only]], TypeIs[SupportsV0Only]]:
+    model: type[object] | object,
+) -> TypeIs[type[SupportsV0Only]] | TypeIs[SupportsV0Only]:
     return getattr(model, "supports_v0_only", False)
 
 
@@ -942,8 +944,8 @@ def supports_eagle3(model: object) -> TypeIs[SupportsEagle3]: ...
 
 
 def supports_eagle3(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[SupportsEagle3]], TypeIs[SupportsEagle3]]:
+    model: type[object] | object,
+) -> TypeIs[type[SupportsEagle3]] | TypeIs[SupportsEagle3]:
     return isinstance(model, SupportsEagle3)
 
 
@@ -964,12 +966,12 @@ class SupportsMRoPE(Protocol):
         self,
         input_tokens: list[int],
         hf_config: PretrainedConfig,
-        image_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
-        video_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
-        second_per_grid_ts: Optional[list[float]] = None,
+        image_grid_thw: list[list[int]] | torch.Tensor | None,
+        video_grid_thw: list[list[int]] | torch.Tensor | None,
+        second_per_grid_ts: list[float] | None = None,
         context_len: int = 0,
-        seq_len: Optional[int] = None,
-        audio_feature_lengths: Optional[torch.Tensor] = None,
+        seq_len: int | None = None,
+        audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
         """
@@ -1007,6 +1009,6 @@ def supports_mrope(model: object) -> TypeIs[SupportsMRoPE]: ...
 
 
 def supports_mrope(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[SupportsMRoPE]], TypeIs[SupportsMRoPE]]:
+    model: type[object] | object,
+) -> TypeIs[type[SupportsMRoPE]] | TypeIs[SupportsMRoPE]:
     return isinstance(model, SupportsMRoPE)
diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py
index b697eb25b5cc2..afb94f7c35467 100644
--- a/vllm/model_executor/models/interfaces_base.py
+++ b/vllm/model_executor/models/interfaces_base.py
@@ -5,9 +5,7 @@ from typing import (
     Any,
     ClassVar,
     Literal,
-    Optional,
     Protocol,
-    Union,
     overload,
     runtime_checkable,
 )
@@ -63,12 +61,12 @@ class VllmModel(Protocol[T_co]):
     ) -> T_co: ...
 
 
-def _check_vllm_model_init(model: Union[type[object], object]) -> bool:
+def _check_vllm_model_init(model: type[object] | object) -> bool:
     model_init = model.__init__
     return supports_kw(model_init, "vllm_config")
 
 
-def _check_vllm_model_get_input_embeddings(model: Union[type[object], object]) -> bool:
+def _check_vllm_model_get_input_embeddings(model: type[object] | object) -> bool:
     model_get_input_embeddings = getattr(model, "get_input_embeddings", None)
     if not callable(model_get_input_embeddings):
         logger.warning(
@@ -80,7 +78,7 @@ def _check_vllm_model_get_input_embeddings(model: Union[type[object], object]) -
     return True
 
 
-def _check_vllm_model_forward(model: Union[type[object], object]) -> bool:
+def _check_vllm_model_forward(model: type[object] | object) -> bool:
     model_forward = getattr(model, "forward", None)
     if not callable(model_forward):
         return False
@@ -108,8 +106,8 @@ def is_vllm_model(model: object) -> TypeIs[VllmModel]: ...
 
 
 def is_vllm_model(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[VllmModel]], TypeIs[VllmModel]]:
+    model: type[object] | object,
+) -> TypeIs[type[VllmModel]] | TypeIs[VllmModel]:
     return (
         _check_vllm_model_init(model)
         and _check_vllm_model_get_input_embeddings(model)
@@ -124,7 +122,7 @@ class VllmModelForTextGeneration(VllmModel[T], Protocol[T]):
     def compute_logits(
         self,
         hidden_states: T,
-    ) -> Optional[T]:
+    ) -> T | None:
         """Return `None` if TP rank > 0."""
         ...
 
@@ -140,10 +138,8 @@ def is_text_generation_model(model: object) -> TypeIs[VllmModelForTextGeneration
 
 
 def is_text_generation_model(
-    model: Union[type[object], object],
-) -> Union[
-    TypeIs[type[VllmModelForTextGeneration]], TypeIs[VllmModelForTextGeneration]
-]:
+    model: type[object] | object,
+) -> TypeIs[type[VllmModelForTextGeneration]] | TypeIs[VllmModelForTextGeneration]:
     if not is_vllm_model(model):
         return False
 
@@ -190,8 +186,8 @@ def is_pooling_model(model: object) -> TypeIs[VllmModelForPooling]: ...
 
 
 def is_pooling_model(
-    model: Union[type[object], object],
-) -> Union[TypeIs[type[VllmModelForPooling]], TypeIs[VllmModelForPooling]]:
+    model: type[object] | object,
+) -> TypeIs[type[VllmModelForPooling]] | TypeIs[VllmModelForPooling]:
     if not is_vllm_model(model):
         return False
 
@@ -211,5 +207,5 @@ def default_pooling_type(pooling_type: str):
     return func
 
 
-def get_default_pooling_type(model: Union[type[object], object]) -> str:
+def get_default_pooling_type(model: type[object] | object) -> str:
     return getattr(model, "default_pooling_type", "LAST")
diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py
index 9435ff0d26cff..03918127c6ae1 100644
--- a/vllm/model_executor/models/intern_vit.py
+++ b/vllm/model_executor/models/intern_vit.py
@@ -9,7 +9,6 @@
 # --------------------------------------------------------
 from collections.abc import Iterable
 from functools import partial
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -121,8 +120,8 @@ class InternVisionPatchModel(nn.Module):
 
     def forward(
         self,
-        pixel_values: Optional[torch.Tensor] = None,
-        pixel_embeds: Optional[torch.Tensor] = None,
+        pixel_values: torch.Tensor | None = None,
+        pixel_embeds: torch.Tensor | None = None,
     ) -> torch.FloatTensor:
         if pixel_values is None and pixel_embeds is None:
             raise ValueError("You have to specify pixel_values or pixel_embeds")
@@ -144,7 +143,7 @@ class InternParallelAttention(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
         num_dummy_heads: int = 0,
         prefix: str = "",
@@ -240,7 +239,7 @@ class InternMLP(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -277,7 +276,7 @@ class InternVisionEncoderLayer(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
         num_dummy_heads: int = 0,
         prefix: str = "",
@@ -312,7 +311,7 @@ class InternVisionEncoderLayer(nn.Module):
     def _init_attn(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         *,
         num_dummy_heads: int,
         prefix: str = "",
@@ -350,9 +349,9 @@ class InternVisionEncoder(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
+        num_hidden_layers_override: int | None = None,
         num_dummy_heads: int = 0,
         prefix: str = "",
         use_data_parallel: bool = False,
@@ -395,9 +394,9 @@ class InternVisionModel(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
+        num_hidden_layers_override: int | None = None,
         num_dummy_heads: int = 0,
         prefix: str = "",
         use_data_parallel: bool = False,
@@ -422,8 +421,8 @@ class InternVisionModel(nn.Module):
 
     def forward(
         self,
-        pixel_values: Optional[torch.Tensor] = None,
-        pixel_embeds: Optional[torch.Tensor] = None,
+        pixel_values: torch.Tensor | None = None,
+        pixel_embeds: torch.Tensor | None = None,
     ) -> torch.FloatTensor:
         if pixel_values is None and pixel_embeds is None:
             raise ValueError("You have to specify pixel_values or pixel_embeds")
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 128791541b3db..8d83a1478dff9 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -4,7 +4,7 @@
 from collections.abc import Iterable
 from functools import partial
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -54,7 +54,7 @@ class InternLM2MLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -92,10 +92,10 @@ class InternLM2Attention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -198,8 +198,8 @@ class InternLMDecoderLayer(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -232,7 +232,7 @@ class InternLMDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
@@ -291,9 +291,9 @@ class InternLM2Model(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -359,8 +359,8 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
@@ -370,7 +370,7 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.output, hidden_states)
         return logits
 
@@ -451,9 +451,9 @@ class InternLM2ForRewardModel(InternLM2ForCausalLM):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py
index 5344ded280b2a..6dc081e34157b 100644
--- a/vllm/model_executor/models/internlm2_ve.py
+++ b/vllm/model_executor/models/internlm2_ve.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -25,8 +24,8 @@ class InternLM2VEDecoderLayer(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -66,8 +65,8 @@ class InternLM2VEDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
-        visual_token_mask: Optional[torch.Tensor] = None,
+        residual: torch.Tensor | None,
+        visual_token_mask: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
@@ -107,10 +106,10 @@ class InternLM2VEModel(InternLM2Model):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        visual_token_mask: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        visual_token_mask: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py
index 06c7c8ccd0b5e..38f052aba3187 100644
--- a/vllm/model_executor/models/interns1.py
+++ b/vllm/model_executor/models/interns1.py
@@ -7,7 +7,7 @@
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal, TypeAlias
 
 import regex as re
 import torch
@@ -111,12 +111,10 @@ class InternS1ImageEmbeddingInputs(TensorSchema):
     """
 
     type: Literal["image_embeds"] = "image_embeds"
-    data: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]], TensorShape("ni", "tifs", "hs")
-    ]
+    data: Annotated[torch.Tensor | list[torch.Tensor], TensorShape("ni", "tifs", "hs")]
 
 
-InternS1ImageInputs = Union[InternS1ImagePixelInputs, InternS1ImageEmbeddingInputs]
+InternS1ImageInputs: TypeAlias = InternS1ImagePixelInputs | InternS1ImageEmbeddingInputs
 
 
 class InternS1VideoPixelInputs(TensorSchema):
@@ -143,12 +141,10 @@ class InternS1VideoEmbeddingInputs(TensorSchema):
     """
 
     type: Literal["video_embeds"] = "video_embeds"
-    data: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]], TensorShape("nv", "tvfs", "hs")
-    ]
+    data: Annotated[torch.Tensor | list[torch.Tensor], TensorShape("nv", "tvfs", "hs")]
 
 
-InternS1VideoInputs = Union[InternS1VideoPixelInputs, InternS1VideoEmbeddingInputs]
+InternS1VideoInputs: TypeAlias = InternS1VideoPixelInputs | InternS1VideoEmbeddingInputs
 
 
 def resolve_interns1_min_max_num(
@@ -190,7 +186,7 @@ class InternS1ProcessingInfo(BaseProcessingInfo):
         )
         return hf_processor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None, "video": None}
 
     def get_num_image_tokens(
@@ -198,7 +194,7 @@ class InternS1ProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional["GotOcr2ImageProcessorFast"] = None,
+        processor: GotOcr2ImageProcessorFast | None = None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor().image_processor
@@ -213,7 +209,7 @@ class InternS1ProcessingInfo(BaseProcessingInfo):
         num_image_tokens = self.get_hf_processor().image_seq_length * num_image_patches
         return num_image_tokens
 
-    def resolve_target_ratios(self, use_thumbnail: Optional[bool] = None):
+    def resolve_target_ratios(self, use_thumbnail: bool | None = None):
         image_processor = self.get_hf_processor().image_processor
         min_dynamic_patch = image_processor.min_patches
         max_dynamic_patch = image_processor.max_patches
@@ -298,7 +294,7 @@ class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo])
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         target_width, target_height = self.info.get_image_size_with_most_features()
         target_num_frames = self.info.get_num_frames_with_most_features(
@@ -523,7 +519,7 @@ class InternS1ForConditionalGeneration(
     )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         # transformers InternVLProcessor uses <IMG_CONTEXT> as the separator
         # refer to https://github.com/huggingface/transformers/blob/f90de364c2484c7c325bbe05befdcf487bd75b63/src/transformers/models/internvl/processing_internvl.py#L116
         if modality.startswith("image"):
@@ -576,7 +572,7 @@ class InternS1ForConditionalGeneration(
     def _init_vision_model(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         *,
         prefix: str,
     ):
@@ -620,7 +616,7 @@ class InternS1ForConditionalGeneration(
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[InternS1ImageInputs]:
+    ) -> InternS1ImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_num_patches = kwargs.pop("image_num_patches", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -654,7 +650,7 @@ class InternS1ForConditionalGeneration(
 
     def _parse_and_validate_video_input(
         self, **kwargs: object
-    ) -> Optional[InternS1VideoInputs]:
+    ) -> InternS1VideoInputs | None:
         pixel_values_flat_video = kwargs.pop("pixel_values_videos", None)
         video_num_patches = kwargs.pop("video_num_patches", None)
         video_embeds = kwargs.pop("video_embeds", None)
@@ -688,7 +684,7 @@ class InternS1ForConditionalGeneration(
 
     def _process_vision_input(
         self,
-        image_input: Union[InternS1ImageInputs, InternS1VideoInputs],
+        image_input: InternS1ImageInputs | InternS1VideoInputs,
     ) -> tuple[torch.Tensor, ...]:
         if (
             image_input["type"] == "image_embeds"
@@ -763,9 +759,9 @@ class InternS1ForConditionalGeneration(
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
         *,
-        is_multimodal: Optional[torch.Tensor] = None,
+        is_multimodal: torch.Tensor | None = None,
         handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
         if multimodal_embeddings is not None and len(multimodal_embeddings) > 0:
@@ -786,8 +782,8 @@ class InternS1ForConditionalGeneration(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> IntermediateTensors:
         if intermediate_tensors is not None:
@@ -807,7 +803,7 @@ class InternS1ForConditionalGeneration(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/interns1_vit.py b/vllm/model_executor/models/interns1_vit.py
index f5965bdf7c9c7..cfc8b7e6084e2 100644
--- a/vllm/model_executor/models/interns1_vit.py
+++ b/vllm/model_executor/models/interns1_vit.py
@@ -8,7 +8,6 @@
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -139,7 +138,7 @@ class InternS1VisionEmbeddings(nn.Module):
     def forward(
         self,
         pixel_values: torch.Tensor,
-        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        bool_masked_pos: torch.BoolTensor | None = None,
     ) -> torch.Tensor:
         _, _, height, width = pixel_values.shape
         embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
@@ -240,7 +239,7 @@ class InternS1VisionMLP(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -274,7 +273,7 @@ class InternS1VisionLayer(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
         num_dummy_heads: int = 0,
         prefix: str = "",
@@ -309,7 +308,7 @@ class InternS1VisionLayer(nn.Module):
     def _init_attn(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         *,
         num_dummy_heads: int,
         prefix: str = "",
@@ -337,9 +336,9 @@ class InternS1VisionEncoder(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
+        num_hidden_layers_override: int | None = None,
         num_dummy_heads: int = 0,
         prefix: str = "",
     ):
@@ -376,9 +375,9 @@ class InternS1VisionModel(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
+        num_hidden_layers_override: int | None = None,
         num_dummy_heads: int = 0,
         prefix: str = "",
     ) -> None:
@@ -404,8 +403,8 @@ class InternS1VisionModel(nn.Module):
 
     def forward(
         self,
-        pixel_values: Optional[torch.Tensor] = None,
-        pixel_embeds: Optional[torch.Tensor] = None,
+        pixel_values: torch.Tensor | None = None,
+        pixel_embeds: torch.Tensor | None = None,
     ) -> torch.FloatTensor:
         if pixel_values is None and pixel_embeds is None:
             raise ValueError("You have to specify pixel_values or pixel_embeds")
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 3cd3807dd8884..47429ef1b76e0 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -10,7 +10,7 @@
 import os
 from abc import ABC, abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Any, Literal, Optional, TypeVar, Union
+from typing import Annotated, Any, Literal, TypeAlias, TypeVar
 
 import numpy.typing as npt
 import torch
@@ -94,10 +94,10 @@ class InternVLImageEmbeddingInputs(TensorSchema):
     """
 
     type: Literal["image_embeds"]
-    data: Annotated[Union[torch.Tensor, list[torch.Tensor]], TensorShape("n", "f", "h")]
+    data: Annotated[torch.Tensor | list[torch.Tensor], TensorShape("n", "f", "h")]
 
 
-InternVLImageInputs = Union[InternVLImagePixelInputs, InternVLImageEmbeddingInputs]
+InternVLImageInputs: TypeAlias = InternVLImagePixelInputs | InternVLImageEmbeddingInputs
 
 
 class InternVLVideoPixelInputs(TensorSchema):
@@ -124,10 +124,10 @@ class InternVLVideoEmbeddingInputs(TensorSchema):
     """
 
     type: Literal["video_embeds"]
-    data: Annotated[Union[torch.Tensor, list[torch.Tensor]], TensorShape("n", "f", "h")]
+    data: Annotated[torch.Tensor | list[torch.Tensor], TensorShape("n", "f", "h")]
 
 
-InternVLVideoInputs = Union[InternVLVideoPixelInputs, InternVLVideoEmbeddingInputs]
+InternVLVideoInputs: TypeAlias = InternVLVideoPixelInputs | InternVLVideoEmbeddingInputs
 
 
 # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
@@ -349,9 +349,9 @@ class BaseInternVLProcessor(ABC):
         config: PretrainedConfig,
         tokenizer: AnyTokenizer,
         *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
     ) -> None:
         super().__init__()
 
@@ -391,17 +391,17 @@ class BaseInternVLProcessor(ABC):
     def get_image_repl(
         self,
         feature_size: int,
-        num_patches: Optional[int],
+        num_patches: int | None,
     ) -> PromptUpdateDetails[str]:
         raise NotImplementedError
 
     def resolve_min_max_num(
         self,
         *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        use_thumbnail: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        use_thumbnail: bool | None = None,
     ) -> tuple[int, int]:
         min_dynamic_patch = (
             self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
@@ -426,10 +426,10 @@ class BaseInternVLProcessor(ABC):
     def resolve_target_ratios(
         self,
         *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        use_thumbnail: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        use_thumbnail: bool | None = None,
     ) -> list[tuple[int, int]]:
         min_num, max_num = self.resolve_min_max_num(
             min_dynamic_patch=min_dynamic_patch,
@@ -463,9 +463,9 @@ class BaseInternVLProcessor(ABC):
     def _images_to_pixel_values_lst(
         self,
         images: list[Image.Image],
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
     ) -> list[torch.Tensor]:
         min_num, max_num = self.resolve_min_max_num(
             min_dynamic_patch=min_dynamic_patch,
@@ -489,9 +489,9 @@ class BaseInternVLProcessor(ABC):
         self,
         text: list[str],
         images: list[Image.Image],
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
     ) -> tuple[list[str], dict[str, torch.Tensor]]:
         if len(images) == 0:
             image_inputs = {}
@@ -517,7 +517,7 @@ class BaseInternVLProcessor(ABC):
                 text = [t.replace("<image>", image_repl.full, 1) for t in text]
         return text, image_inputs
 
-    def _make_batch_input(self, input_item: Optional[Union[Any, list[Any]]] = None):
+    def _make_batch_input(self, input_item: Any | list[Any] | None = None):
         if input_item is None:
             input_item = []
         if not isinstance(input_item, list):
@@ -526,12 +526,12 @@ class BaseInternVLProcessor(ABC):
 
     def __call__(
         self,
-        text: Optional[Union[str, list[str]]] = None,
-        images: Optional[Union[Image.Image, list[Image.Image]]] = None,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        text: str | list[str] | None = None,
+        images: Image.Image | list[Image.Image] | None = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        return_tensors: str | TensorType | None = None,
     ) -> BatchFeature:
         text, images = [self._make_batch_input(x) for x in (text, images)]
 
@@ -563,10 +563,10 @@ class InternVLProcessor(BaseInternVLProcessor):
         config: PretrainedConfig,
         tokenizer: AnyTokenizer,
         *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        video_token: Optional[str] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        video_token: str | None = None,
     ) -> None:
         super().__init__(
             config=config,
@@ -583,7 +583,7 @@ class InternVLProcessor(BaseInternVLProcessor):
         return self.tokenizer.get_vocab()[IMG_CONTEXT]
 
     @property
-    def video_token_id(self) -> Optional[int]:
+    def video_token_id(self) -> int | None:
         if self.video_token is None:
             return None
         return self.tokenizer.get_vocab().get(self.video_token, None)
@@ -595,7 +595,7 @@ class InternVLProcessor(BaseInternVLProcessor):
     def _videos_to_pixel_values_lst(
         self,
         videos: list[npt.NDArray],
-        dynamic_image_size: Optional[bool] = None,
+        dynamic_image_size: bool | None = None,
     ) -> list[torch.Tensor]:
         min_num, max_num = self.resolve_min_max_num(
             min_dynamic_patch=1,
@@ -619,7 +619,7 @@ class InternVLProcessor(BaseInternVLProcessor):
         self,
         text: list[str],
         videos: list[npt.NDArray],
-        dynamic_image_size: Optional[bool] = None,
+        dynamic_image_size: bool | None = None,
     ):
         if len(videos) == 0 or not self.supports_video:
             video_inputs = {}
@@ -646,13 +646,13 @@ class InternVLProcessor(BaseInternVLProcessor):
 
     def __call__(
         self,
-        text: Optional[Union[str, list[str]]] = None,
-        images: Optional[Union[Image.Image, list[Image.Image]]] = None,
-        videos: Optional[Union[npt.NDArray, list[npt.NDArray]]] = None,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        text: str | list[str] | None = None,
+        images: Image.Image | list[Image.Image] | None = None,
+        videos: npt.NDArray | list[npt.NDArray] | None = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        return_tensors: str | TensorType | None = None,
     ) -> BatchFeature:
         text, images, videos = [
             self._make_batch_input(x) for x in (text, images, videos)
@@ -681,7 +681,7 @@ class InternVLProcessor(BaseInternVLProcessor):
     def get_image_repl(
         self,
         feature_size: int,
-        num_patches: Optional[int],
+        num_patches: int | None,
     ) -> PromptUpdateDetails[str]:
         repl_features = IMG_CONTEXT * feature_size
         repl_full = IMG_START + repl_features + IMG_END
@@ -691,7 +691,7 @@ class InternVLProcessor(BaseInternVLProcessor):
     def get_video_repl(
         self,
         feature_size: int,
-        num_patches: Optional[int] = None,
+        num_patches: int | None = None,
         video_context_token: str = IMG_CONTEXT,
     ) -> PromptUpdateDetails[str]:
         repl_features = video_context_token * self.num_image_token
@@ -711,7 +711,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self, **kwargs: object) -> BaseInternVLProcessor:
         raise NotImplementedError
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_num_image_tokens(
@@ -719,7 +719,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[BaseInternVLProcessor],
+        processor: BaseInternVLProcessor | None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor()
@@ -779,7 +779,7 @@ class BaseInternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         target_width, target_height = self.info.get_image_size_with_most_features()
         num_images = mm_counts.get("image", 0)
@@ -901,7 +901,7 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo):
         video_limit = {"video": None} if self.supports_video else {}
         return {**super().get_supported_mm_limits(), **video_limit}
 
-    def get_video_token(self) -> Optional[str]:
+    def get_video_token(self) -> str | None:
         text_model_type = self.get_hf_config().get_text_config().model_type
         video_token_map = {
             "qwen2": "<|video_pad|>",
@@ -951,7 +951,7 @@ class InternVLDummyInputsBuilder(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         dummy_image = super().get_dummy_mm_data(
             seq_len=seq_len, mm_counts=mm_counts, mm_options=mm_options
@@ -1079,7 +1079,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA)
     supports_encoder_tp_data = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
         if modality.startswith("video"):
@@ -1149,7 +1149,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA)
     def _init_vision_model(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         *,
         is_mono: bool,
         prefix: str,
@@ -1217,7 +1217,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA)
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[InternVLImageInputs]:
+    ) -> InternVLImageInputs | None:
         pixel_values_flat = kwargs.pop("pixel_values_flat", None)
         image_num_patches = kwargs.pop("image_num_patches", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -1250,7 +1250,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA)
 
     def _parse_and_validate_video_input(
         self, **kwargs: object
-    ) -> Optional[InternVLVideoPixelInputs]:
+    ) -> InternVLVideoPixelInputs | None:
         pixel_values_flat_video = kwargs.pop("pixel_values_flat_video", None)
         video_num_patches = kwargs.pop("video_num_patches", None)
         video_embeds = kwargs.pop("image_embeds", None)
@@ -1283,7 +1283,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA)
 
     def _process_vision_input(
         self,
-        image_input: Union[InternVLImageInputs, InternVLVideoInputs],
+        image_input: InternVLImageInputs | InternVLVideoInputs,
     ) -> tuple[torch.Tensor, ...]:
         if (
             image_input["type"] == "image_embeds"
@@ -1364,9 +1364,9 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA)
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
         *,
-        is_multimodal: Optional[torch.Tensor] = None,
+        is_multimodal: torch.Tensor | None = None,
         handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
         if multimodal_embeddings is not None and len(multimodal_embeddings) > 0:
@@ -1387,8 +1387,8 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA)
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> IntermediateTensors:
         if intermediate_tensors is not None:
@@ -1413,7 +1413,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA)
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py
index d788ed7ec2af7..1daaed80b1440 100644
--- a/vllm/model_executor/models/jais.py
+++ b/vllm/model_executor/models/jais.py
@@ -24,7 +24,6 @@
 import math
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -86,8 +85,8 @@ class JAISAttention(nn.Module):
     def __init__(
         self,
         config: JAISConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -147,7 +146,7 @@ class JAISMLP(nn.Module):
         self,
         intermediate_size: int,
         config: JAISConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
         hidden_size = config.hidden_size
@@ -194,8 +193,8 @@ class JAISBlock(nn.Module):
     def __init__(
         self,
         config: JAISConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -277,9 +276,9 @@ class JAISModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[IntermediateTensors, torch.Tensor]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> IntermediateTensors | torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is None:
                 inputs_embeds = self.get_input_embeddings(input_ids)
@@ -341,9 +340,9 @@ class JAISLMHeadModel(nn.Module, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[IntermediateTensors, torch.Tensor]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> IntermediateTensors | torch.Tensor:
         hidden_states = self.transformer(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -352,7 +351,7 @@ class JAISLMHeadModel(nn.Module, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 0371458f55784..49cb9311a786d 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -4,7 +4,6 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional
 
 import torch
 from torch import nn
@@ -54,11 +53,11 @@ class JambaMoE(nn.Module):
     def __init__(
         self,
         config: JambaConfig,
-        num_experts: Optional[int] = None,
-        top_k: Optional[int] = None,
-        params_dtype: Optional[torch.dtype] = None,
-        tp_size: Optional[int] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        num_experts: int | None = None,
+        top_k: int | None = None,
+        params_dtype: torch.dtype | None = None,
+        tp_size: int | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -111,10 +110,10 @@ class JambaMambaDecoderLayer(nn.Module):
         self,
         config: JambaConfig,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        is_lora_enabled: Optional[bool] = False,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        is_lora_enabled: bool | None = False,
         prefix: str = "",
         **kwargs,
     ) -> None:
@@ -159,7 +158,7 @@ class JambaMambaDecoderLayer(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         if residual is None:
@@ -181,9 +180,9 @@ class JambaAttentionDecoderLayer(nn.Module):
         self,
         config: JambaConfig,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         **kwargs,
     ) -> None:
@@ -266,7 +265,7 @@ class JambaAttentionDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         if residual is None:
@@ -348,8 +347,8 @@ class JambaModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
@@ -523,8 +522,8 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, IsHyb
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
     ):
         hidden_states = self.model(
@@ -568,7 +567,7 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, IsHyb
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/jina_vl.py b/vllm/model_executor/models/jina_vl.py
index 9711eeeeec33e..a9333155243d4 100644
--- a/vllm/model_executor/models/jina_vl.py
+++ b/vllm/model_executor/models/jina_vl.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Mapping
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -105,14 +104,14 @@ class JinaVLForSequenceClassification(
         )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|vision_start|><|image_pad|><|vision_end|>"
 
         raise ValueError("Only image modality is supported")
 
     @classmethod
-    def get_score_template(cls, query: str, document: str) -> Optional[str]:
+    def get_score_template(cls, query: str, document: str) -> str | None:
         return f"**Document**:\n{document}\n**Query**:\n{query}"
 
     @classmethod
@@ -124,8 +123,8 @@ class JinaVLForSequenceClassification(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> torch.Tensor:
         hidden_states = super().forward(
diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py
index 7ccbc81431f62..028162fdbf110 100644
--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -4,7 +4,7 @@ import math
 from abc import abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
 from functools import partial
-from typing import Annotated, Any, Literal, Optional, TypeVar, Union
+from typing import Annotated, Any, Literal, TypeAlias, TypeVar
 
 import numpy as np
 import torch
@@ -153,7 +153,7 @@ class KeyeImageEmbeddingInputs(TensorSchema):
     image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)]
 
 
-KeyeImageInputs = Union[KeyeImagePixelInputs, KeyeImageEmbeddingInputs]
+KeyeImageInputs: TypeAlias = KeyeImagePixelInputs | KeyeImageEmbeddingInputs
 
 
 class KeyeVideoPixelInputs(TensorSchema):
@@ -188,7 +188,7 @@ class KeyeVideoEmbeddingInputs(TensorSchema):
     video_grid_thw: Annotated[torch.Tensor, TensorShape("nv", 3)]
 
 
-KeyeVideoInputs = Union[KeyeVideoPixelInputs, KeyeVideoEmbeddingInputs]
+KeyeVideoInputs: TypeAlias = KeyeVideoPixelInputs | KeyeVideoEmbeddingInputs
 
 
 class KeyeVisionEmbeddings(nn.Module):
@@ -278,15 +278,9 @@ class KeyeVisionEmbeddings(nn.Module):
     def forward(
         self,
         pixel_values: torch.FloatTensor,
-        position_ids: Optional[torch.Tensor] = None,
-        image_grid_thw: Optional[
-            list[
-                Union[
-                    tuple[int, int, int],
-                    list[tuple[int, int, int]],
-                ]
-            ]
-        ] = None,
+        position_ids: torch.Tensor | None = None,
+        image_grid_thw: list[tuple[int, int, int] | list[tuple[int, int, int]]]
+        | None = None,
         interpolate_pos_encoding=False,
     ) -> torch.Tensor:
         if pixel_values.dim() == 4:
@@ -357,7 +351,7 @@ class KeyeSiglipAttention(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -416,10 +410,10 @@ class KeyeSiglipAttention(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        output_attentions: Optional[bool] = False,
-        cu_seqlens: Optional[list[torch.Tensor]] = None,
-        rope_emb: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        attention_mask: torch.Tensor | None = None,
+        output_attentions: bool | None = False,
+        cu_seqlens: list[torch.Tensor] | None = None,
+        rope_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
     ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split(
@@ -524,8 +518,8 @@ class SigLIPRotaryEmbedding(nn.Module):
 class KeyeSiglipEncoderLayer(nn.Module):
     def __init__(
         self,
-        config: Union[PretrainedConfig],
-        quant_config: Optional[QuantizationConfig] = None,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -547,9 +541,9 @@ class KeyeSiglipEncoderLayer(nn.Module):
         self,
         hidden_states: torch.Tensor,
         attention_mask: torch.Tensor,
-        output_attentions: Optional[bool] = False,
-        cu_seqlens: Optional[list[torch.Tensor]] = None,
-        rope_emb: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        output_attentions: bool | None = False,
+        cu_seqlens: list[torch.Tensor] | None = None,
+        rope_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
     ) -> tuple[torch.FloatTensor]:
         residual = hidden_states
 
@@ -577,7 +571,7 @@ class KeyeSiglipEncoder(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -610,22 +604,16 @@ class KeyeSiglipEncoder(nn.Module):
     def forward(
         self,
         inputs_embeds,
-        attention_mask: Optional[torch.Tensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        cu_seqlens: Optional[list[torch.Tensor]] = None,
-        image_grid_thw: Optional[
-            list[
-                Union[
-                    tuple[int, int, int],
-                    list[tuple[int, int, int]],
-                ]
-            ]
-        ] = None,
-        height_position_ids: Optional[torch.Tensor] = None,
-        width_position_ids: Optional[torch.Tensor] = None,
-        use_rope: Optional[bool] = False,
-        window_size: Optional[bool] = -1,
+        attention_mask: torch.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        cu_seqlens: list[torch.Tensor] | None = None,
+        image_grid_thw: list[tuple[int, int, int] | list[tuple[int, int, int]]]
+        | None = None,
+        height_position_ids: torch.Tensor | None = None,
+        width_position_ids: torch.Tensor | None = None,
+        use_rope: bool | None = False,
+        window_size: bool | None = -1,
         vision_or_text: str = "vision",
     ) -> BaseModelOutput:
         device = inputs_embeds.device
@@ -676,7 +664,7 @@ class KeyeSiglipVisionTransformer(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -694,29 +682,23 @@ class KeyeSiglipVisionTransformer(nn.Module):
     def forward(
         self,
         pixel_values,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        interpolate_pos_encoding: Optional[bool] = False,
-        attention_mask: Optional[torch.Tensor] = None,
-        sample_indices: Optional[torch.Tensor] = None,
-        image_indices: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.Tensor] = None,
-        height_position_ids: Optional[torch.Tensor] = None,
-        width_position_ids: Optional[torch.Tensor] = None,
-        cu_seqlens: Optional[list[torch.Tensor]] = None,
-        padding_mask: Optional[torch.Tensor] = None,
-        vision_return_embed_list: Optional[bool] = False,
-        image_grid_thw: Optional[
-            list[
-                Union[
-                    tuple[int, int, int],
-                    list[tuple[int, int, int]],
-                ]
-            ]
-        ] = None,
-        return_pooler_output: Optional[bool] = True,
-        use_rope: Optional[bool] = False,
-        window_size: Optional[bool] = -1,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        interpolate_pos_encoding: bool | None = False,
+        attention_mask: torch.Tensor | None = None,
+        sample_indices: torch.Tensor | None = None,
+        image_indices: torch.Tensor | None = None,
+        position_ids: torch.Tensor | None = None,
+        height_position_ids: torch.Tensor | None = None,
+        width_position_ids: torch.Tensor | None = None,
+        cu_seqlens: list[torch.Tensor] | None = None,
+        padding_mask: torch.Tensor | None = None,
+        vision_return_embed_list: bool | None = False,
+        image_grid_thw: list[tuple[int, int, int] | list[tuple[int, int, int]]]
+        | None = None,
+        return_pooler_output: bool | None = True,
+        use_rope: bool | None = False,
+        window_size: bool | None = -1,
     ) -> BaseModelOutputWithPooling:
         hidden_states = self.embeddings(
             pixel_values,
@@ -763,7 +745,7 @@ class KeyeSiglipVisionModel(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -789,24 +771,18 @@ class KeyeSiglipVisionModel(nn.Module):
     def forward(
         self,
         pixel_values,
-        sample_indices: Optional[torch.Tensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
+        sample_indices: torch.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
         interpolate_pos_encoding: bool = False,
-        position_ids: Optional[torch.Tensor] = None,
-        vision_return_embed_list: Optional[bool] = False,
-        image_grid_thw: Optional[
-            list[
-                Union[
-                    tuple[int, int, int],
-                    list[tuple[int, int, int]],
-                ]
-            ]
-        ] = None,
-        cu_seqlens: Optional[list[torch.Tensor]] = None,
-        return_pooler_output: Optional[bool] = True,
-        use_rope: Optional[bool] = False,
-        window_size: Optional[bool] = -1,
+        position_ids: torch.Tensor | None = None,
+        vision_return_embed_list: bool | None = False,
+        image_grid_thw: list[tuple[int, int, int] | list[tuple[int, int, int]]]
+        | None = None,
+        cu_seqlens: list[torch.Tensor] | None = None,
+        return_pooler_output: bool | None = True,
+        use_rope: bool | None = False,
+        window_size: bool | None = -1,
     ) -> BaseModelOutputWithPooling:
         return self.vision_model(
             pixel_values=pixel_values,
@@ -893,7 +869,7 @@ class Projector(nn.Module):
         self,
         text_config: PretrainedConfig,
         vision_config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -927,9 +903,9 @@ class Projector(nn.Module):
 
     def forward(
         self,
-        image_features: Union[torch.Tensor, list[torch.Tensor]],
+        image_features: torch.Tensor | list[torch.Tensor],
         image_grid_thw: list[tuple[int, int, int]],
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+    ) -> torch.Tensor | list[torch.Tensor]:
         m1, m2 = self.merge_kernel_size
         if isinstance(image_features, (list, tuple)):
             processed_features = list()
@@ -988,7 +964,7 @@ def _keye_field_config(
 class KeyeMultiModalDataParser(MultiModalDataParser):
     def _parse_image_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
+        data: dict[str, torch.Tensor] | ModalityData[ImageItem],
     ) -> ModalityDataItems[Any, Any]:
         if isinstance(data, dict):
             return DictEmbeddingItems(
@@ -1005,7 +981,7 @@ class KeyeMultiModalDataParser(MultiModalDataParser):
 
     def _parse_video_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]],
+        data: dict[str, torch.Tensor] | ModalityData[VideoItem],
     ) -> ModalityDataItems[Any, Any]:
         if isinstance(data, dict):
             return DictEmbeddingItems(
@@ -1033,7 +1009,7 @@ class KeyeProcessingInfo(BaseProcessingInfo):
 
     def get_supported_mm_limits(
         self,
-    ) -> Mapping[str, Optional[int]]:
+    ) -> Mapping[str, int | None]:
         return {"image": None, "video": None}
 
     def get_mm_max_tokens_per_item(
@@ -1200,7 +1176,7 @@ class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
@@ -1303,7 +1279,7 @@ class BaseKeyeModule(nn.Module):
     )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|vision_start|><|image_pad|><|vision_end|>"
         if modality.startswith("video"):
@@ -1348,7 +1324,7 @@ class BaseKeyeModule(nn.Module):
         self,
         text_config: PretrainedConfig,
         vision_config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> nn.Module:
         raise ValueError("Need projector")
@@ -1403,8 +1379,8 @@ class BaseKeyeModule(nn.Module):
         self,
         video_type: Literal["video_embeds", "pixel_values_videos"],
         video_grid_thw: list[torch.Tensor],
-        pixel_values_videos: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+        pixel_values_videos: torch.Tensor | None = None,
+    ) -> torch.Tensor | list[torch.Tensor]:
         siglip_position_ids = list()
         video_grid_hws = list()
         sample_indices = list()
@@ -1473,7 +1449,7 @@ class BaseKeyeModule(nn.Module):
 
     def get_multimodal_embeddings(
         self, **kwargs: object
-    ) -> Optional[MultiModalEmbeddings]:
+    ) -> MultiModalEmbeddings | None:
         modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not modalities:
             return None
@@ -1495,10 +1471,10 @@ class BaseKeyeModule(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for Keye-VL.
 
         Args:
@@ -1527,7 +1503,7 @@ class BaseKeyeModule(nn.Module):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
@@ -1555,14 +1531,14 @@ class KeyeForConditionalGeneration(
         self,
         text_config: PretrainedConfig,
         vision_config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> nn.Module:
         return Projector(text_config, vision_config, quant_config, prefix)
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[KeyeImageInputs]:
+    ) -> KeyeImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
         image_grid_thw = kwargs.pop("image_grid_thw", None)
@@ -1586,7 +1562,7 @@ class KeyeForConditionalGeneration(
 
     def _parse_and_validate_video_input(
         self, **kwargs: object
-    ) -> Optional[KeyeVideoInputs]:
+    ) -> KeyeVideoInputs | None:
         pixel_values_videos = kwargs.pop("pixel_values_videos", None)
         video_embeds = kwargs.pop("video_embeds", None)
         video_grid_thw = kwargs.pop("video_grid_thw", None)
diff --git a/vllm/model_executor/models/keye_vl1_5.py b/vllm/model_executor/models/keye_vl1_5.py
index 21d8099b43d16..9a9a46995af9e 100644
--- a/vllm/model_executor/models/keye_vl1_5.py
+++ b/vllm/model_executor/models/keye_vl1_5.py
@@ -3,7 +3,7 @@
 import itertools
 from collections.abc import Mapping, Sequence
 from functools import partial
-from typing import Annotated, Any, Literal, Optional, Union
+from typing import Annotated, Any, Literal, TypeAlias
 
 import numpy as np
 import torch
@@ -73,7 +73,7 @@ def split_thw(grid_thw: torch.Tensor) -> torch.Tensor:
 
 
 def get_num_patches(
-    grid_thw: torch.Tensor, num_frames: Union[list[int], torch.Tensor]
+    grid_thw: torch.Tensor, num_frames: list[int] | torch.Tensor
 ) -> list[int]:
     """
     Return num_patches per video.
@@ -153,7 +153,9 @@ class KeyeVL1_5ImageEmbeddingInputs(TensorSchema):
     image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)]
 
 
-KeyeVL1_5ImageInputs = Union[KeyeVL1_5ImagePixelInputs, KeyeVL1_5ImageEmbeddingInputs]
+KeyeVL1_5ImageInputs: TypeAlias = (
+    KeyeVL1_5ImagePixelInputs | KeyeVL1_5ImageEmbeddingInputs
+)
 
 
 class KeyeVL1_5VideoPixelInputs(TensorSchema):
@@ -191,7 +193,9 @@ class KeyeVL1_5VideoEmbeddingInputs(TensorSchema):
     num_frames: torch.Tensor
 
 
-KeyeVL1_5VideoInputs = Union[KeyeVL1_5VideoPixelInputs, KeyeVL1_5VideoEmbeddingInputs]
+KeyeVL1_5VideoInputs: TypeAlias = (
+    KeyeVL1_5VideoPixelInputs | KeyeVL1_5VideoEmbeddingInputs
+)
 
 
 class KeyeVL1_5Projector(nn.Module):
@@ -199,7 +203,7 @@ class KeyeVL1_5Projector(nn.Module):
         self,
         text_config: PretrainedConfig,
         vision_config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -233,9 +237,9 @@ class KeyeVL1_5Projector(nn.Module):
 
     def forward(
         self,
-        image_features: Union[torch.Tensor, tuple[torch.Tensor], list[torch.Tensor]],
+        image_features: torch.Tensor | tuple[torch.Tensor] | list[torch.Tensor],
         image_grid_thw: list[tuple[int, int, int]],
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+    ) -> torch.Tensor | list[torch.Tensor]:
         m1, m2 = self.merge_kernel_size
         if isinstance(image_features, (list, tuple)):
             processed_features = list()
@@ -275,7 +279,7 @@ class KeyeVL1_5ProcessingInfo(KeyeProcessingInfo):
 
     def get_supported_mm_limits(
         self,
-    ) -> Mapping[str, Optional[int]]:
+    ) -> Mapping[str, int | None]:
         return {"image": None, "video": 1}
 
 
@@ -327,7 +331,7 @@ def _keye_field_config(
 class KeyeVL1_5MultiModalDataParser(MultiModalDataParser):
     def _parse_image_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
+        data: dict[str, torch.Tensor] | ModalityData[ImageItem],
     ) -> ModalityDataItems[Any, Any]:
         if isinstance(data, dict):
             return DictEmbeddingItems(
@@ -344,7 +348,7 @@ class KeyeVL1_5MultiModalDataParser(MultiModalDataParser):
 
     def _parse_video_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]],
+        data: dict[str, torch.Tensor] | ModalityData[VideoItem],
     ) -> ModalityDataItems[Any, Any]:
         if isinstance(data, dict):
             return DictEmbeddingItems(
@@ -499,7 +503,7 @@ class KeyeVL1_5ForConditionalGeneration(
         self,
         text_config: PretrainedConfig,
         vision_config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> nn.Module:
         return KeyeVL1_5Projector(text_config, vision_config, quant_config, prefix)
@@ -511,7 +515,7 @@ class KeyeVL1_5ForConditionalGeneration(
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[KeyeVL1_5ImageInputs]:
+    ) -> KeyeVL1_5ImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
         image_grid_thw = kwargs.pop("image_grid_thw", None)
@@ -535,7 +539,7 @@ class KeyeVL1_5ForConditionalGeneration(
 
     def _parse_and_validate_video_input(
         self, **kwargs: object
-    ) -> Optional[KeyeVL1_5VideoInputs]:
+    ) -> KeyeVL1_5VideoInputs | None:
         pixel_values_videos = kwargs.pop("pixel_values_videos", None)
         video_embeds = kwargs.pop("video_embeds", None)
         video_grid_thw = kwargs.pop("video_grid_thw", None)
@@ -595,19 +599,19 @@ class KeyeVL1_5ForConditionalGeneration(
         cls,
         input_tokens: list[int],
         hf_config: PretrainedConfig,
-        image_grid_thw: Union[list[list[int]], torch.Tensor],
-        video_grid_thw: Union[list[list[int]], torch.Tensor],
+        image_grid_thw: list[list[int]] | torch.Tensor,
+        video_grid_thw: list[list[int]] | torch.Tensor,
         context_len: int = 0,
-        seq_len: Optional[int] = None,
-        second_per_grid_ts: Optional[list[float]] = None,
-        audio_feature_lengths: Optional[torch.Tensor] = None,
+        seq_len: int | None = None,
+        second_per_grid_ts: list[float] | None = None,
+        audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
         if isinstance(video_grid_thw, list) and len(video_grid_thw) > 0:
             video_grid_thw = video_grid_thw[0]
         """Get mrope input positions and delta value (Keye series)."""
 
-        def split_thw(grid_thw: Union[torch.Tensor, list[int]]) -> list[list[int]]:
+        def split_thw(grid_thw: torch.Tensor | list[int]) -> list[list[int]]:
             """
             Split grid_thw along the t dimension.
 
diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py
index f7381e6b6b93e..c2630fa6ac2b6 100644
--- a/vllm/model_executor/models/kimi_vl.py
+++ b/vllm/model_executor/models/kimi_vl.py
@@ -46,7 +46,7 @@ import copy
 import math
 from collections.abc import Iterable, Mapping, Sequence
 from dataclasses import dataclass
-from typing import Annotated, Any, Literal, Optional, Union
+from typing import Annotated, Any, Literal
 
 import torch
 from torch import nn
@@ -153,7 +153,7 @@ class KimiVLImagePixelInputs(TensorSchema):
     type: Literal["pixel_values"] = "pixel_values"
 
     pixel_values: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("np", 3, "ps", "ps"),
     ]
 
@@ -169,7 +169,7 @@ class KimiVLProcessingInfo(BaseProcessingInfo):
     def get_hf_config(self):
         return self.ctx.get_hf_config(KimiVLConfig)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_num_image_tokens(
@@ -227,7 +227,7 @@ class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
@@ -305,7 +305,7 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
     supports_encoder_tp_data = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|media_start|>image<|media_content|><|media_pad|><|media_end|>"
 
@@ -370,7 +370,7 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[KimiVLImageInputs]:
+    ) -> KimiVLImageInputs | None:
         # image input type must be pixel values now
         pixel_values = kwargs.pop("pixel_values", None)
         image_grid_hws = kwargs.pop("image_grid_hws", None)
@@ -411,7 +411,7 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
     def get_language_model(self) -> torch.nn.Module:
         return self.language_model
 
-    def get_multimodal_embeddings(self, **kwargs: object) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(self, **kwargs: object) -> NestedTensors | None:
         # Validate the multimodal input keyword arguments
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
@@ -425,8 +425,8 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> IntermediateTensors:
         if intermediate_tensors is not None:
@@ -570,7 +570,7 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
 
 def get_spec_layer_idx_from_weight_name(
     config: DeepseekV2Config, weight_name: str
-) -> Optional[int]:
+) -> int | None:
     if hasattr(config, "num_nextn_predict_layers") and (
         config.num_nextn_predict_layers > 0
     ):
diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py
index 425c936877602..5684b9a891257 100644
--- a/vllm/model_executor/models/lfm2.py
+++ b/vllm/model_executor/models/lfm2.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional
+from typing import Any
 
 import torch
 import torch.nn as nn
@@ -54,8 +54,8 @@ class Lfm2MLP(nn.Module):
         ff_dim: int,
         multiple_of: int,
         auto_adjust_ff_dim: bool,
-        ffn_dim_multiplier: Optional[float],
-        quant_config: Optional[QuantizationConfig] = None,
+        ffn_dim_multiplier: float | None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -98,10 +98,10 @@ class Lfm2Attention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -190,9 +190,9 @@ class Lfm2AttentionDecoderLayer(nn.Module):
         self,
         config: Lfm2Config,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -240,7 +240,7 @@ class Lfm2AttentionDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if residual is None:
@@ -258,9 +258,9 @@ class Lfm2ShortConvDecoderLayer(nn.Module):
         self,
         config: Lfm2Config,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -289,7 +289,7 @@ class Lfm2ShortConvDecoderLayer(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         if residual is None:
@@ -365,8 +365,8 @@ class Lfm2Model(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
@@ -532,8 +532,8 @@ class Lfm2ForCausalLM(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
     ) -> torch.Tensor:
         hidden_states = self.model(
diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py
index f7903a7af53fe..bb7926a9cfa9d 100644
--- a/vllm/model_executor/models/lfm2_moe.py
+++ b/vllm/model_executor/models/lfm2_moe.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional
+from typing import Any
 
 import torch
 import torch.nn as nn
@@ -65,7 +65,7 @@ class Lfm2MoeMlp(nn.Module):
         self,
         dim: int,
         ff_dim: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -96,7 +96,7 @@ class Lfm2MoeSparseMoeBlock(nn.Module):
     def __init__(
         self,
         config: Lfm2MoeConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         enable_eplb: bool = False,
     ):
@@ -191,10 +191,10 @@ class Lfm2MoeAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -283,9 +283,9 @@ class Lfm2MoeAttentionDecoderLayer(nn.Module):
         self,
         config: Lfm2MoeConfig,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         enable_eplb: bool = False,
     ) -> None:
@@ -340,7 +340,7 @@ class Lfm2MoeAttentionDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if residual is None:
@@ -358,9 +358,9 @@ class Lfm2MoeShortConvDecoderLayer(nn.Module):
         self,
         config: Lfm2MoeConfig,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         enable_eplb: bool = False,
     ) -> None:
@@ -396,7 +396,7 @@ class Lfm2MoeShortConvDecoderLayer(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         if residual is None:
@@ -479,8 +479,8 @@ class Lfm2MoeModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
@@ -774,8 +774,8 @@ class Lfm2MoeForCausalLM(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
     ) -> torch.Tensor:
         hidden_states = self.model(
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 948c9280f953a..7cc908e52c887 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -26,7 +26,7 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -76,7 +76,7 @@ class LlamaMLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         prefix: str = "",
         reduce_results: bool = True,
@@ -121,12 +121,12 @@ class LlamaAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         bias_o_proj: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
         attn_type: str = AttentionType.DECODER,
     ) -> None:
@@ -236,8 +236,8 @@ class LlamaAttention(nn.Module):
     def _init_rotary_emb(
         self,
         config: LlamaConfig,
-        rope_scaling: Optional[dict[str, Any]],
-        quant_config: Optional[QuantizationConfig],
+        rope_scaling: dict[str, Any] | None,
+        quant_config: QuantizationConfig | None,
     ) -> None:
         is_neox_style = True
         is_gguf = quant_config and quant_config.get_name() == "gguf"
@@ -260,7 +260,7 @@ class LlamaDecoderLayer(nn.Module):
         self,
         vllm_config: VllmConfig,
         prefix: str = "",
-        config: Optional[LlamaConfig] = None,
+        config: LlamaConfig | None = None,
     ) -> None:
         super().__init__()
 
@@ -331,7 +331,7 @@ class LlamaDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
@@ -346,7 +346,7 @@ class LlamaDecoderLayer(nn.Module):
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
-    def get_quant_config(self, vllm_config: VllmConfig) -> Optional[QuantizationConfig]:
+    def get_quant_config(self, vllm_config: VllmConfig) -> QuantizationConfig | None:
         """Get quantization config for this layer. Override in subclasses."""
         return vllm_config.quant_config
 
@@ -407,13 +407,11 @@ class LlamaModel(nn.Module):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[
-        torch.Tensor, IntermediateTensors, tuple[torch.Tensor, list[torch.Tensor]]
-    ]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors | tuple[torch.Tensor, list[torch.Tensor]]:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -627,9 +625,9 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         model_output = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -638,7 +636,7 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py
index df7bd9b7f6d1b..33badb13fc9fb 100644
--- a/vllm/model_executor/models/llama4.py
+++ b/vllm/model_executor/models/llama4.py
@@ -19,7 +19,7 @@
 """Inference-only LLaMA model compatible with HuggingFace weights."""
 
 from collections.abc import Iterable
-from typing import Any, Optional
+from typing import Any
 
 import torch
 from torch import nn
@@ -148,12 +148,12 @@ class Llama4Attention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         bias_o_proj: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -296,7 +296,7 @@ class Llama4DecoderLayer(nn.Module):
         self,
         vllm_config: VllmConfig,
         prefix: str = "",
-        config: Optional[Llama4TextConfig] = None,
+        config: Llama4TextConfig | None = None,
     ) -> None:
         super().__init__()
 
@@ -352,7 +352,7 @@ class Llama4DecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
diff --git a/vllm/model_executor/models/llama4_eagle.py b/vllm/model_executor/models/llama4_eagle.py
index 039022ef4527f..dd6337244ca68 100644
--- a/vllm/model_executor/models/llama4_eagle.py
+++ b/vllm/model_executor/models/llama4_eagle.py
@@ -17,7 +17,6 @@
 # limitations under the License.
 
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -49,7 +48,7 @@ class LlamaModel(nn.Module):
         vllm_config: VllmConfig,
         prefix: str = "",
         start_layer_id: int = 0,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ) -> None:
         super().__init__()
         self.config = vllm_config.speculative_config.draft_model_config.hf_config
@@ -81,10 +80,10 @@ class LlamaModel(nn.Module):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if inputs_embeds is None:
             inputs_embeds = self.get_input_embeddings(input_ids)
@@ -136,7 +135,7 @@ class LlamaModel(nn.Module):
         return loaded_params
 
     def validate_and_update_config(
-        self, start_layer_id: int, quant_config: Optional[QuantizationConfig] = None
+        self, start_layer_id: int, quant_config: QuantizationConfig | None = None
     ) -> None:
         # yoco and moe is not supported by draft model yet
         assert self.config.yoco_global_kv_layer is None
@@ -193,7 +192,7 @@ class EagleLlama4ForCausalLM(Llama4ForCausalLM):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         return self.model(input_ids, positions, hidden_states, inputs_embeds)
 
diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py
index 5df158818c9fb..8f4ba88677734 100644
--- a/vllm/model_executor/models/llama_eagle.py
+++ b/vllm/model_executor/models/llama_eagle.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -28,7 +27,7 @@ class LlamaDecoderLayer(LlamaDecoderLayer):
         vllm_config: VllmConfig,
         disable_input_layernorm: bool,
         prefix: str = "",
-        config: Optional[LlamaConfig] = None,
+        config: LlamaConfig | None = None,
     ) -> None:
         super().__init__(vllm_config, prefix=prefix, config=config)
 
@@ -155,7 +154,7 @@ class EagleLlamaForCausalLM(LlamaForCausalLM):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if inputs_embeds is not None:
             raise NotImplementedError(
diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py
index 67d4669899193..da4bbda186b17 100644
--- a/vllm/model_executor/models/llama_eagle3.py
+++ b/vllm/model_executor/models/llama_eagle3.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -35,7 +34,7 @@ class LlamaDecoderLayer(LlamaDecoderLayer):
         self,
         vllm_config: VllmConfig,
         prefix: str = "",
-        config: Optional[LlamaConfig] = None,
+        config: LlamaConfig | None = None,
         layer_idx: int = 0,
     ) -> None:
         super().__init__(vllm_config, prefix=prefix, config=config)
@@ -66,7 +65,7 @@ class LlamaDecoderLayer(LlamaDecoderLayer):
         else:
             self._residual_norm = self._norm_after_residual
 
-    def get_quant_config(self, vllm_config: VllmConfig) -> Optional[QuantizationConfig]:
+    def get_quant_config(self, vllm_config: VllmConfig) -> QuantizationConfig | None:
         """Use drafter's quantization config instead of verifier's."""
         draft_model_config = vllm_config.speculative_config.draft_model_config
         draft_load_config = vllm_config.load_config
@@ -96,7 +95,7 @@ class LlamaDecoderLayer(LlamaDecoderLayer):
         positions: torch.Tensor,
         embeds: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if self.layer_idx == 0:
             # First layer: concatenate embeds with hidden_states
@@ -182,7 +181,7 @@ class LlamaModel(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        input_embeds: Optional[torch.Tensor] = None,
+        input_embeds: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if input_embeds is None:
             input_embeds = self.get_input_embeddings(input_ids)
@@ -268,8 +267,8 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[NestedTensors] = None,
-        is_multimodal: Optional[torch.Tensor] = None,
+        multimodal_embeddings: NestedTensors | None = None,
+        is_multimodal: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return self.model.get_input_embeddings(input_ids)
 
@@ -278,14 +277,14 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         return self.model(input_ids, positions, hidden_states, inputs_embeds)
 
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         if self.draft_id_to_target_id is None:
             assert logits.shape[1] == self.config.vocab_size, (
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 3d46e22a0d217..a3dea0ce86f8e 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -3,7 +3,7 @@
 
 from abc import abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Final, Literal, Optional, Protocol, TypeVar, Union
+from typing import Annotated, Final, Literal, Protocol, TypeAlias, TypeVar
 
 import torch
 import torch.nn as nn
@@ -93,7 +93,7 @@ class PixtralHFImagePixelInputs(TensorSchema):
 
     type: Literal["pixel_values_pixtral"] = "pixel_values_pixtral"
     pixel_values: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "c", "h", "w", dynamic_dims={"h", "w"}),
     ]
 
@@ -110,9 +110,9 @@ class LlavaImageEmbeddingInputs(TensorSchema):
     data: Annotated[torch.Tensor, TensorShape("bn", "ifs", "hs")]
 
 
-LlavaImageInputs = Union[
-    LlavaImagePixelInputs, PixtralHFImagePixelInputs, LlavaImageEmbeddingInputs
-]
+LlavaImageInputs: TypeAlias = (
+    LlavaImagePixelInputs | PixtralHFImagePixelInputs | LlavaImageEmbeddingInputs
+)
 
 
 class LlavaMultiModalProjector(nn.Module):
@@ -122,7 +122,7 @@ class LlavaMultiModalProjector(nn.Module):
         text_hidden_size: int,
         projector_hidden_act: str,
         multimodal_projector_bias: bool,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -154,7 +154,7 @@ class LlavaLikeConfig(Protocol):
     vision_config: Final[PretrainedConfig]
     image_token_index: Final[int]
     vision_feature_select_strategy: Final[str]
-    vision_feature_layer: Final[Union[int, list[int]]]
+    vision_feature_layer: Final[int | list[int]]
 
 
 class LlavaLikeProcessor(Protocol):
@@ -172,7 +172,7 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self, **kwargs: object) -> LlavaLikeProcessor:
         raise NotImplementedError
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_num_image_tokens(
@@ -222,7 +222,7 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
@@ -406,7 +406,7 @@ def _build_llava_or_pixtral_hf_processor(
     info: _I,
     dummy_inputs: BaseDummyInputsBuilder[_I],
     *,
-    cache: Optional[BaseMultiModalProcessorCache] = None,
+    cache: BaseMultiModalProcessorCache | None = None,
 ) -> BaseMultiModalProcessor:
     if isinstance(info, PixtralHFProcessingInfo):
         return PixtralHFMultiModalProcessor(
@@ -461,11 +461,11 @@ def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
 
 def init_vision_tower_for_llava(
     hf_config: LlavaLikeConfig,
-    quant_config: Optional[QuantizationConfig],
+    quant_config: QuantizationConfig | None,
     *,
-    require_post_norm: Optional[bool] = None,
+    require_post_norm: bool | None = None,
     prefix: str = "",
-) -> Union[CLIPVisionModel, SiglipVisionModel, PixtralHFVisionModel]:
+) -> CLIPVisionModel | SiglipVisionModel | PixtralHFVisionModel:
     vision_config = hf_config.vision_config
 
     # Initialize the vision tower only up to the deepest required feature layer
@@ -524,7 +524,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
     )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
 
@@ -585,7 +585,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[LlavaImageInputs]:
+    ) -> LlavaImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
 
@@ -619,9 +619,9 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
 
     def _image_pixels_to_features(
         self,
-        vision_tower: Union[CLIPVisionModel, SiglipVisionModel, PixtralHFVisionModel],
-        pixel_values: Union[torch.Tensor, list[torch.Tensor]],
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+        vision_tower: CLIPVisionModel | SiglipVisionModel | PixtralHFVisionModel,
+        pixel_values: torch.Tensor | list[torch.Tensor],
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         # NOTE: we skip the step to select the vision feature layer since
         # this is already done inside the vision tower
         return vision_tower(
@@ -631,8 +631,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
 
     def _process_image_pixels(
         self,
-        inputs: Union[LlavaImagePixelInputs, PixtralHFImagePixelInputs],
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+        inputs: LlavaImagePixelInputs | PixtralHFImagePixelInputs,
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         assert self.vision_tower is not None
 
         pixel_values = inputs["pixel_values"]
@@ -642,7 +642,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
     def _process_image_input(
         self,
         image_input: LlavaImageInputs,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         if image_input["type"] == "image_embeds":
             return image_input["data"]
 
@@ -672,10 +672,10 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for LLaVA-1.5.
 
         One key thing to understand is the `input_ids` already accounts for the
@@ -725,7 +725,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
@@ -754,11 +754,11 @@ class MantisProcessingInfo(LlavaProcessingInfo):
 class MantisMultiModalProcessor(LlavaMultiModalProcessor):
     def apply(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-        tokenization_kwargs: Optional[Mapping[str, object]] = None,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        tokenization_kwargs: Mapping[str, object] | None = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalInputs:
         hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index caedace7cab1e..3cf546644d04a 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -3,7 +3,7 @@
 
 from abc import abstractmethod
 from collections.abc import Iterable, Mapping
-from typing import Annotated, Final, Literal, Optional, Protocol, TypeVar, Union
+from typing import Annotated, Final, Literal, Protocol, TypeAlias, TypeVar
 
 import torch
 import torch.nn as nn
@@ -55,11 +55,11 @@ class LlavaNextImagePixelInputs(TensorSchema):
 
     type: Literal["pixel_values"] = "pixel_values"
     pixel_values: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "np", 3, "h", "w", dynamic_dims={"np"}),
     ]
 
-    image_sizes: Annotated[Optional[torch.Tensor], TensorShape("bn", 2)]
+    image_sizes: Annotated[torch.Tensor | None, TensorShape("bn", 2)]
     # This should be in `(height, width)` format.
 
 
@@ -75,7 +75,9 @@ class LlavaNextImageEmbeddingInputs(TensorSchema):
     data: Annotated[torch.Tensor, TensorShape("bn", "ifs", "hs")]
 
 
-LlavaNextImageInputs = Union[LlavaNextImagePixelInputs, LlavaNextImageEmbeddingInputs]
+LlavaNextImageInputs: TypeAlias = (
+    LlavaNextImagePixelInputs | LlavaNextImageEmbeddingInputs
+)
 
 
 class LlavaNextLikeConfig(LlavaLikeConfig, Protocol):
@@ -235,7 +237,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
     )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
 
@@ -294,7 +296,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[LlavaNextImageInputs]:
+    ) -> LlavaNextImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_sizes = kwargs.pop("image_sizes", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -324,7 +326,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
 
     def _image_pixels_to_features(
         self,
-        vision_tower: Union[CLIPVisionModel, SiglipVisionModel],
+        vision_tower: CLIPVisionModel | SiglipVisionModel,
         pixel_values: torch.Tensor,
     ) -> torch.Tensor:
         # NOTE: we skip the step to select the vision feature layer since
@@ -424,7 +426,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
     def _process_image_pixels(
         self,
         inputs: LlavaNextImagePixelInputs,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         assert self.vision_tower is not None
 
         pixel_values = inputs["pixel_values"]
@@ -456,7 +458,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
     def _process_image_input(
         self,
         image_input: LlavaNextImageInputs,
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+    ) -> torch.Tensor | list[torch.Tensor]:
         if image_input["type"] == "image_embeds":
             return [image_input["data"]]
 
@@ -491,9 +493,9 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
         *,
-        is_multimodal: Optional[torch.Tensor] = None,
+        is_multimodal: torch.Tensor | None = None,
         # Multi-modal token ID may exceed vocab size
         handle_oov_mm_token: bool = True,
     ) -> torch.Tensor:
@@ -512,10 +514,10 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for LlaVA-NeXT.
 
         One key thing to understand is the `input_ids` already accounts for the
@@ -573,7 +575,7 @@ model_executor.models.llava_next.LlavaNextProcessingInfo.get_num_image_tokens].
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index 074acc7943a43..17732f8a54902 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -3,7 +3,7 @@
 
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal
 
 import torch
 import torch.nn as nn
@@ -66,7 +66,7 @@ class LlavaNextVideoPixelInputs(TensorSchema):
     type: Literal["pixel_values_videos"] = "pixel_values_videos"
 
     pixel_values_videos: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "f", 3, "h", "w", dynamic_dims={"f"}),
     ]
 
@@ -81,7 +81,7 @@ class LlavaNextVideoProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self, **kwargs: object):
         return self.ctx.get_hf_processor(LlavaNextVideoProcessor, **kwargs)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"video": 1}
 
     def get_image_size_with_most_features(self) -> ImageSize:
@@ -165,7 +165,7 @@ class LlavaNextVideoDummyInputsBuilder(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_videos = mm_counts.get("video", 0)
 
@@ -313,7 +313,7 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, Supp
     )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
         if modality.startswith("video"):
@@ -356,7 +356,7 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, Supp
 
     def _parse_and_validate_video_input(
         self, **kwargs: object
-    ) -> Optional[LlavaNextVideoPixelInputs]:
+    ) -> LlavaNextVideoPixelInputs | None:
         """
         A legal video input should have the following dimensions:
         {
@@ -381,7 +381,7 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, Supp
 
     def _video_pixels_to_features(
         self,
-        vision_tower: Union[CLIPVisionModel, SiglipVisionModel],
+        vision_tower: CLIPVisionModel | SiglipVisionModel,
         pixel_values: torch.Tensor,
     ) -> torch.Tensor:
         # NOTE: we skip the step to select the vision feature layer since
@@ -433,10 +433,10 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, Supp
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for LlaVA-NeXT-Video.
         Args:
             input_ids: Flattened (concatenated) input_ids corresponding to a
@@ -455,7 +455,7 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, Supp
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 05f1621694c36..c9a27728eb735 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -3,7 +3,7 @@
 
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Final, Literal, Optional, Protocol, Union
+from typing import Annotated, Final, Literal, Protocol, TypeAlias
 
 import torch
 import torch.nn as nn
@@ -69,7 +69,7 @@ class LlavaOnevisionVideoPixelInputs(TensorSchema):
     type: Literal["pixel_values_videos"] = "pixel_values_videos"
 
     pixel_values_videos: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "f", 3, "h", "w", dynamic_dims={"f"}),
     ]
 
@@ -90,11 +90,11 @@ class LlavaOnevisionImagePixelInputs(TensorSchema):
     type: Literal["pixel_values"] = "pixel_values"
 
     pixel_values: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "np", 3, "h", "w", dynamic_dims={"np"}),
     ]
 
-    image_sizes: Annotated[Optional[torch.Tensor], TensorShape("bn", 2)]
+    image_sizes: Annotated[torch.Tensor | None, TensorShape("bn", 2)]
 
 
 class LlavaOnevisionImageEmbeddingInputs(TensorSchema):
@@ -113,13 +113,13 @@ class LlavaOnevisionImageEmbeddingInputs(TensorSchema):
     ]
 
 
-LlavaOnevisionImageInputs = Union[
-    LlavaOnevisionImagePixelInputs, LlavaOnevisionImageEmbeddingInputs
-]
+LlavaOnevisionImageInputs: TypeAlias = (
+    LlavaOnevisionImagePixelInputs | LlavaOnevisionImageEmbeddingInputs
+)
 
-LlavaOnevisionMultiInputs = Union[
-    LlavaOnevisionImageInputs, LlavaOnevisionVideoPixelInputs
-]
+LlavaOnevisionMultiInputs: TypeAlias = (
+    LlavaOnevisionImageInputs | LlavaOnevisionVideoPixelInputs
+)
 
 
 class LlavaOnevisionLikeConfig(LlavaNextLikeConfig, Protocol):
@@ -133,7 +133,7 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
     def get_hf_processor(self, **kwargs: object):
         return self.ctx.get_hf_processor(LlavaOnevisionProcessor, **kwargs)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None, "video": None}
 
     # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86
@@ -276,7 +276,7 @@ class LlavaOnevisionDummyInputsBuilder(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
@@ -493,7 +493,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, Supp
     )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
         if modality.startswith("video"):
@@ -531,7 +531,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, Supp
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[LlavaOnevisionImageInputs]:
+    ) -> LlavaOnevisionImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_sizes = kwargs.pop("image_sizes", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -560,7 +560,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, Supp
 
     def _parse_and_validate_video_input(
         self, **kwargs: object
-    ) -> Optional[LlavaOnevisionVideoPixelInputs]:
+    ) -> LlavaOnevisionVideoPixelInputs | None:
         """
         A legal video input should have the following dimensions:
         {
@@ -606,7 +606,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, Supp
 
     def _image_pixels_to_features(
         self,
-        vision_tower: Union[CLIPVisionModel, SiglipVisionModel],
+        vision_tower: CLIPVisionModel | SiglipVisionModel,
         pixel_values: torch.Tensor,
     ) -> torch.Tensor:
         # NOTE: we skip the step to select the vision feature layer since
@@ -726,7 +726,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, Supp
     def _process_image_pixels(
         self,
         inputs: LlavaOnevisionImagePixelInputs,
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+    ) -> torch.Tensor | list[torch.Tensor]:
         assert self.vision_tower is not None
 
         pixel_values = inputs["pixel_values"]
@@ -761,7 +761,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, Supp
     def _process_image_input(
         self,
         image_input: LlavaOnevisionImageInputs,
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+    ) -> torch.Tensor | list[torch.Tensor]:
         if image_input["type"] == "image_embeds":
             return [image_input["data"]]
 
@@ -788,7 +788,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, Supp
 
     def _video_pixels_to_features(
         self,
-        vision_tower: Union[CLIPVisionModel, SiglipVisionModel],
+        vision_tower: CLIPVisionModel | SiglipVisionModel,
         pixel_values: torch.Tensor,
     ) -> torch.Tensor:
         # NOTE: we skip the step to select the vision feature layer since
@@ -893,10 +893,10 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, Supp
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for LlaVA-Onevision.
         Args:
             input_ids: Flattened (concatenated) input_ids corresponding to a
@@ -915,7 +915,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, Supp
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py
index 17ec6b7d2b06a..5d26e1c38eed4 100644
--- a/vllm/model_executor/models/longcat_flash.py
+++ b/vllm/model_executor/models/longcat_flash.py
@@ -36,7 +36,6 @@
 import typing
 from collections.abc import Callable, Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -194,7 +193,7 @@ class FlashMLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         reduce_results: bool = True,
         prefix: str = "",
     ) -> None:
@@ -270,8 +269,8 @@ class LongcatMoe(nn.Module):
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         enable_eplb: bool = False,
     ):
@@ -329,8 +328,8 @@ class FlashDecoderLayer(nn.Module):
         self,
         vllm_config: VllmConfig,
         config: FlashConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         enable_eplb: bool = False,
     ) -> None:
@@ -415,7 +414,7 @@ class FlashDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if residual is None:
             residual = hidden_states
@@ -506,9 +505,9 @@ class FlashModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -592,9 +591,9 @@ class LongcatFlashForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -603,7 +602,7 @@ class LongcatFlashForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/longcat_flash_mtp.py b/vllm/model_executor/models/longcat_flash_mtp.py
index 55468f354c3a2..e554d1e2de927 100644
--- a/vllm/model_executor/models/longcat_flash_mtp.py
+++ b/vllm/model_executor/models/longcat_flash_mtp.py
@@ -4,7 +4,6 @@
 # Adapted from
 # https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/model_executor/models/deepseek_mtp.py
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -35,7 +34,7 @@ class LongCatMultiTokenPredictorLayer(nn.Module):
         config: PretrainedConfig,
         prefix: str,
         vllm_config: VllmConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ) -> None:
         super().__init__()
         self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -55,7 +54,7 @@ class LongCatMultiTokenPredictorLayer(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         previous_hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_index: int = 0,
     ) -> torch.Tensor:
         assert inputs_embeds is not None
@@ -78,7 +77,7 @@ class LongCatMultiTokenPredictor(nn.Module):
         self,
         *,
         vllm_config: VllmConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -110,7 +109,7 @@ class LongCatMultiTokenPredictor(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         previous_hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         if inputs_embeds is None:
@@ -155,8 +154,8 @@ class LongCatFlashMTP(nn.Module, SupportsPP):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         hidden_states = self.model(
@@ -168,7 +167,7 @@ class LongCatFlashMTP(nn.Module, SupportsPP):
         self,
         hidden_states: torch.Tensor,
         spec_step_idx: int = 0,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
@@ -344,7 +343,7 @@ class LongCatFlashMTP(nn.Module, SupportsPP):
 
     def get_spec_layer_idx_from_weight_name(
         self, config: PretrainedConfig, weight_name: str
-    ) -> Optional[int]:
+    ) -> int | None:
         if "model.mtp" in weight_name:
             return config.num_hidden_layers * 2
         return None
diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py
index 1638aab137aaf..fb145289fbfe9 100644
--- a/vllm/model_executor/models/mamba.py
+++ b/vllm/model_executor/models/mamba.py
@@ -4,7 +4,6 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional
 
 import torch
 from torch import nn
@@ -49,10 +48,10 @@ class MambaDecoderLayer(nn.Module):
     def __init__(
         self,
         config: MambaConfig,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        is_lora_enabled: Optional[bool] = False,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        is_lora_enabled: bool | None = False,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -83,7 +82,7 @@ class MambaDecoderLayer(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         if residual is None:
@@ -149,8 +148,8 @@ class MambaModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
@@ -244,8 +243,8 @@ class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
     ):
         hidden_states = self.backbone(
diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py
index 4491648f3a0ad..5eb21b966e187 100644
--- a/vllm/model_executor/models/mamba2.py
+++ b/vllm/model_executor/models/mamba2.py
@@ -3,7 +3,6 @@
 """PyTorch MAMBA2 model."""
 
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 from torch import nn
@@ -44,9 +43,9 @@ class Mamba2DecoderLayer(nn.Module):
     def __init__(
         self,
         config: MambaConfig,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -76,7 +75,7 @@ class Mamba2DecoderLayer(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         if residual is None:
@@ -142,8 +141,8 @@ class Mamba2Model(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
@@ -277,8 +276,8 @@ class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
     ):
         hidden_states = self.backbone(
diff --git a/vllm/model_executor/models/midashenglm.py b/vllm/model_executor/models/midashenglm.py
index 47839a2c6b03f..2a798672d13c6 100644
--- a/vllm/model_executor/models/midashenglm.py
+++ b/vllm/model_executor/models/midashenglm.py
@@ -25,8 +25,8 @@
 
 import collections
 import collections.abc
-from collections.abc import Iterable, Mapping, Sequence
-from typing import Any, Callable, Optional, TypedDict, Union, cast
+from collections.abc import Callable, Iterable, Mapping, Sequence
+from typing import Any, TypeAlias, TypedDict, cast
 
 import numpy as np
 import torch
@@ -66,7 +66,7 @@ from vllm.transformers_utils.configs.midashenglm import DashengConfig
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
 
-_Tuple2 = Union[int, tuple[int, int], Sequence[int]]
+_Tuple2: TypeAlias = int | tuple[int, int] | Sequence[int]
 
 
 def _resolve_tuple2(x: _Tuple2) -> tuple[int, int]:
@@ -105,7 +105,7 @@ class AudioPatchEmbed(nn.Module):
         patch_stride: _Tuple2 = 16,
         in_chans: int = 1,
         embed_dim: int = 768,
-        norm_layer: Optional[Callable] = None,
+        norm_layer: Callable | None = None,
         flatten: bool = False,
     ):
         super().__init__()
@@ -151,9 +151,9 @@ class DashengMlp(nn.Module):
     def __init__(
         self,
         in_features: int,
-        hidden_features: Optional[int] = None,
-        out_features: Optional[int] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        hidden_features: int | None = None,
+        out_features: int | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -186,7 +186,7 @@ class DashengAttention(nn.Module):
         dim: int,
         num_heads: int = 8,
         qkv_bias: bool = False,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -226,7 +226,7 @@ class DashengAttention(nn.Module):
             prefix=f"{prefix}.proj",
         )
 
-    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None):
+    def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None):
         B, N, C = x.shape
 
         qkv, _ = self.qkv(x)
@@ -253,8 +253,8 @@ class DashengBlock(nn.Module):
         num_heads: int,
         mlp_ratio: float = 4.0,
         qkv_bias: bool = False,
-        init_values: Optional[float] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        init_values: float | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -285,7 +285,7 @@ class DashengBlock(nn.Module):
     def forward(
         self,
         x: torch.Tensor,
-        mask: Optional[torch.Tensor] = None,
+        mask: torch.Tensor | None = None,
     ) -> torch.Tensor:
         x = x + self.ls1(self.attn(self.norm1(x), mask))
         x = x + self.ls2(self.mlp(self.norm2(x)))
@@ -349,7 +349,7 @@ class DashengAudioTransformer(nn.Module):
     def __init__(
         self,
         config: DashengConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -393,7 +393,7 @@ class DashengAudioTransformer(nn.Module):
     def forward_features(
         self,
         x: torch.Tensor,
-        mask: Optional[torch.Tensor] = None,
+        mask: torch.Tensor | None = None,
     ) -> torch.Tensor:
         t = x.shape[-1]
         x = x + self.time_pos_embed[:, :, :, :t]
@@ -418,8 +418,8 @@ class DashengAudioTransformer(nn.Module):
     def forward(
         self,
         x: torch.Tensor,
-        x_length: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        x_length: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         x = self.front_end(x)
         x = x.to(self.time_pos_embed.dtype)
         target_length_in_patches = self.target_length // 4
@@ -462,8 +462,8 @@ class AudioProjectorSubsample(nn.Module):
         in_dim: int,
         out_dim: int,
         downsample_rate=5,
-        dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -524,7 +524,7 @@ class MiDashengLMProcessingInfo(BaseProcessingInfo):
         feature_extractor = hf_processor.feature_extractor
         return feature_extractor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None}
 
     def get_min_audio_len(self):
@@ -550,7 +550,7 @@ class MiDashengLMDummyInputsBuilder(BaseDummyInputsBuilder[MiDashengLMProcessing
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_audios = mm_counts.get("audio", 0)
 
@@ -689,7 +689,7 @@ class MiDashengLMModel(nn.Module, SupportsMultiModal, SupportsPP):
     }
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("audio"):
             return "<|audio_bos|><|AUDIO|><|audio_eos|>"
 
@@ -750,7 +750,7 @@ class MiDashengLMModel(nn.Module, SupportsMultiModal, SupportsPP):
 
     def _parse_and_validate_audio_input(
         self, **kwargs: object
-    ) -> Optional[MiDashengLMAudioInputs]:
+    ) -> MiDashengLMAudioInputs | None:
         input_values = kwargs.pop("input_values", None)
         audio_length = kwargs.pop("audio_length", None)
 
@@ -820,10 +820,10 @@ class MiDashengLMModel(nn.Module, SupportsMultiModal, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
         elif inputs_embeds is None:
@@ -845,7 +845,7 @@ class MiDashengLMModel(nn.Module, SupportsMultiModal, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.decoder.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/mimo.py b/vllm/model_executor/models/mimo.py
index e01e064218420..726752a77e0dc 100644
--- a/vllm/model_executor/models/mimo.py
+++ b/vllm/model_executor/models/mimo.py
@@ -28,7 +28,6 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 import torch.nn as nn
@@ -64,9 +63,9 @@ class MiMoModel(Qwen2Model):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -185,7 +184,7 @@ class MiMoForCausalLM(Qwen2ForCausalLM, nn.Module):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         hidden_states = self.model.norm(hidden_states)
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
diff --git a/vllm/model_executor/models/mimo_mtp.py b/vllm/model_executor/models/mimo_mtp.py
index b678a06b7f20f..3d7695a2a3042 100644
--- a/vllm/model_executor/models/mimo_mtp.py
+++ b/vllm/model_executor/models/mimo_mtp.py
@@ -21,7 +21,6 @@
 """Inference-only MiMo-MTP model."""
 
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -48,8 +47,8 @@ class MiMoMultiTokenPredictorLayer(nn.Module):
         config: PretrainedConfig,
         prefix: str,
         model_config: ModelConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
     ) -> None:
         super().__init__()
 
@@ -129,7 +128,7 @@ class MiMoMultiTokenPredictor(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         previous_hidden_states: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         if inputs_embeds is None:
@@ -173,8 +172,8 @@ class MiMoMTP(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         assert spec_step_idx == 0, "mimo_mtp only support predict one token now"
@@ -187,7 +186,7 @@ class MiMoMTP(nn.Module):
         self,
         hidden_states: torch.Tensor,
         spec_step_idx: int = 0,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.model.compute_logits(hidden_states, self.lm_head, spec_step_idx)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index 06cb6bc615767..09328b4722488 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -27,7 +27,7 @@
 import math
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -89,8 +89,8 @@ class MiniCPMMoE(nn.Module):
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
-        params_dtype: Optional[torch.dtype] = None,
-        tp_size: Optional[int] = None,
+        params_dtype: torch.dtype | None = None,
+        tp_size: int | None = None,
     ):
         super().__init__()
         self.tp_size = tp_size or get_tensor_model_parallel_world_size()
@@ -190,7 +190,7 @@ class MiniCPMMLP(nn.Module):
         intermediate_size: int,
         hidden_act: str,
         hidden_act_param: float,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
@@ -223,10 +223,10 @@ class MiniCPMAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -305,8 +305,8 @@ class MiniCPMDecoderLayer(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -362,7 +362,7 @@ class MiniCPMDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         residual = hidden_states
@@ -425,8 +425,8 @@ class MiniCPMModel(nn.Module):
         self,
         prefix: str,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig],
-        quant_config: Optional[QuantizationConfig],
+        cache_config: CacheConfig | None,
+        quant_config: QuantizationConfig | None,
     ):
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
@@ -444,11 +444,9 @@ class MiniCPMModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[
-        torch.Tensor, IntermediateTensors, tuple[torch.Tensor, list[torch.Tensor]]
-    ]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors | tuple[torch.Tensor, list[torch.Tensor]]:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -633,11 +631,9 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[
-        torch.Tensor, IntermediateTensors, tuple[torch.Tensor, list[torch.Tensor]]
-    ]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors | tuple[torch.Tensor, list[torch.Tensor]]:
         model_output = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -658,7 +654,7 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py
index 35f02a1538e87..ab4fe36476b92 100644
--- a/vllm/model_executor/models/minicpm3.py
+++ b/vllm/model_executor/models/minicpm3.py
@@ -25,7 +25,7 @@
 # limitations under the License.
 """Inference-only MiniCPM3 model compatible with HuggingFace weights."""
 
-from typing import Any, Optional
+from typing import Any
 
 import torch
 from torch import nn
@@ -63,10 +63,10 @@ class MiniCPM3Attention(nn.Module):
         q_lora_rank: int,
         kv_lora_rank: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -214,8 +214,8 @@ class MiniCPM3Model(MiniCPMModel):
         self,
         prefix: str,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig],
-        quant_config: Optional[QuantizationConfig],
+        cache_config: CacheConfig | None,
+        quant_config: QuantizationConfig | None,
     ):
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py
index 6c635b2481093..463af9bbe1399 100644
--- a/vllm/model_executor/models/minicpm_eagle.py
+++ b/vllm/model_executor/models/minicpm_eagle.py
@@ -26,7 +26,6 @@
 
 import math
 from collections.abc import Iterable
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -61,8 +60,8 @@ class EagleMiniCPMDecoderLayer(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -118,7 +117,7 @@ class EagleMiniCPMDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         residual = hidden_states
@@ -185,8 +184,8 @@ class EagleMiniCPMModel(nn.Module):
         self,
         prefix: str,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig],
-        quant_config: Optional[QuantizationConfig],
+        cache_config: CacheConfig | None,
+        quant_config: QuantizationConfig | None,
         start_layer: int,
     ):
         self.eagle_layers = nn.ModuleList(
@@ -210,7 +209,7 @@ class EagleMiniCPMModel(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         input_embeds = self.get_input_embeddings(input_ids)
         input_embeds = self.input_norm1(input_embeds)
         hidden_states = self.input_norm2(hidden_states)
@@ -389,7 +388,7 @@ class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py
index 34f05122abe3a..827c019008ab8 100644
--- a/vllm/model_executor/models/minicpmo.py
+++ b/vllm/model_executor/models/minicpmo.py
@@ -24,8 +24,8 @@
 # limitations under the License.
 """Inference-only MiniCPM-O model compatible with HuggingFace weights."""
 
-from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Any, Callable, Literal, Optional, Union
+from collections.abc import Callable, Iterable, Mapping, Sequence
+from typing import Annotated, Any, Literal, TypeAlias
 
 import torch
 from torch import nn
@@ -89,7 +89,7 @@ class MiniCPMOAudioFeatureInputs(TensorSchema):
     type: Literal["audio_features"] = "audio_features"
 
     audio_features: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bns", "c", "l", dynamic_dims={"l"}),
     ]
     """
@@ -99,7 +99,7 @@ class MiniCPMOAudioFeatureInputs(TensorSchema):
     """
 
     audio_feature_lens: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "s"),
     ]
     """
@@ -121,12 +121,14 @@ class MiniCPMOAudioEmbeddingInputs(TensorSchema):
     type: Literal["audio_embeds"] = "audio_embeds"
 
     audio_embeds: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "s", "h", dynamic_dims={"s"}),
     ]
 
 
-MiniCPMOAudioInputs = Union[MiniCPMOAudioFeatureInputs, MiniCPMOAudioEmbeddingInputs]
+MiniCPMOAudioInputs: TypeAlias = (
+    MiniCPMOAudioFeatureInputs | MiniCPMOAudioEmbeddingInputs
+)
 
 
 def _minicpmo_field_config(hf_inputs: Mapping[str, torch.Tensor]):
@@ -162,8 +164,8 @@ class MiniCPMOAudioEmbeddingItems(DictEmbeddingItems):
 class MiniCPMOMultiModalDataParser(MiniCPMVMultiModalDataParser):
     def _parse_audio_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[AudioItem]],
-    ) -> Optional[ModalityDataItems[Any, Any]]:
+        data: dict[str, torch.Tensor] | ModalityData[AudioItem],
+    ) -> ModalityDataItems[Any, Any] | None:
         if isinstance(data, dict):
             return MiniCPMOAudioEmbeddingItems(
                 data,
@@ -176,7 +178,7 @@ class MiniCPMOMultiModalDataParser(MiniCPMVMultiModalDataParser):
 class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo):
     audio_pattern = "(<audio>./</audio>)"
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {**super().get_supported_mm_limits(), "audio": None}
 
     def get_audio_placeholder(
@@ -253,7 +255,7 @@ class MiniCPMODummyInputsBuilder(MiniCPMVDummyInputsBuilder[MiniCPMOProcessingIn
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_audios = mm_counts.get("audio", 0)
         audio_len = (
@@ -479,7 +481,7 @@ class MiniCPMWhisperEncoder(WhisperEncoder):
     def forward(
         self,
         input_features: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
+        attention_mask: torch.Tensor | None = None,
     ) -> BaseModelOutputWithPast:
         # Ignore copy
         input_features = input_features.to(
@@ -549,7 +551,7 @@ class MiniCPMO(MiniCPMV2_6):
     }
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "(<image>./</image>)"
         if modality.startswith("video"):
@@ -722,7 +724,7 @@ class MiniCPMO(MiniCPMV2_6):
 
     def _parse_and_validate_audio_input(
         self, **kwargs: object
-    ) -> Optional[MiniCPMOAudioInputs]:
+    ) -> MiniCPMOAudioInputs | None:
         audio_features = kwargs.pop("audio_features", None)
         audio_embeds = kwargs.pop("audio_embeds", None)
 
@@ -785,7 +787,7 @@ class MiniCPMO(MiniCPMV2_6):
     def _process_audio_input(
         self,
         audio_input: MiniCPMOAudioInputs,
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+    ) -> torch.Tensor | list[torch.Tensor]:
         if audio_input["type"] == "audio_embeds":
             return audio_input["audio_embeds"]
 
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 09f973e98db99..53a25cf988481 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -26,10 +26,10 @@
 
 import math
 from collections import defaultdict
-from collections.abc import Iterable, Mapping, Sequence
+from collections.abc import Callable, Iterable, Mapping, Sequence
 from functools import partial
 from itertools import chain
-from typing import Annotated, Any, Callable, Literal, Optional, Union
+from typing import Annotated, Any, Literal, TypeAlias
 
 import numpy as np
 import torch
@@ -140,12 +140,12 @@ class MiniCPMVImageEmbeddingInputs(TensorSchema):
 
     type: Literal["image_embeds"]
     image_embeds: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "ns", "hs"),
     ]
 
 
-MiniCPMVImageInputs = Union[MiniCPMVImagePixelInputs, MiniCPMVImageEmbeddingInputs]
+MiniCPMVImageInputs: TypeAlias = MiniCPMVImagePixelInputs | MiniCPMVImageEmbeddingInputs
 
 DEFAULT_LN = partial(nn.LayerNorm, eps=1e-6)
 
@@ -156,10 +156,10 @@ class Resampler2_5(BaseResampler):
         num_queries: int,
         embed_dim: int,
         num_heads: int,
-        kv_dim: Optional[int] = None,
+        kv_dim: int | None = None,
         norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
         max_size: tuple[int, int] = (70, 70),
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__(
@@ -251,11 +251,11 @@ class Resampler4_5(Resampler2_5):
         num_queries: int,
         embed_dim: int,
         num_heads: int,
-        kv_dim: Optional[int] = None,
+        kv_dim: int | None = None,
         norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
         max_size: tuple[int, int] = (70, 70),
         max_temporal_size: int = 36000,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__(
@@ -318,7 +318,7 @@ class Resampler4_5(Resampler2_5):
             self.max_temporal_size = max_temporal_size
             self._set_temporal_pos_cache(self.max_temporal_size, device)
 
-    def _init_weights(self, m: Union[nn.Linear, nn.LayerNorm]):
+    def _init_weights(self, m: nn.Linear | nn.LayerNorm):
         if isinstance(m, nn.Linear):
             trunc_normal_(m.weight, std=0.02)
             if isinstance(m, nn.Linear) and m.bias is not None:
@@ -521,8 +521,8 @@ class MiniCPMVVideoEmbeddingItems(DictEmbeddingItems):
 class MiniCPMVMultiModalDataParser(MultiModalDataParser):
     def _parse_image_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
-    ) -> Optional[ModalityDataItems[Any, Any]]:
+        data: dict[str, torch.Tensor] | ModalityData[ImageItem],
+    ) -> ModalityDataItems[Any, Any] | None:
         if isinstance(data, dict):
             return MiniCPMVImageEmbeddingItems(
                 data,
@@ -533,8 +533,8 @@ class MiniCPMVMultiModalDataParser(MultiModalDataParser):
 
     def _parse_video_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]],
-    ) -> Optional[ModalityDataItems[Any, Any]]:
+        data: dict[str, torch.Tensor] | ModalityData[VideoItem],
+    ) -> ModalityDataItems[Any, Any] | None:
         if isinstance(data, dict):
             return MiniCPMVVideoEmbeddingItems(
                 data,
@@ -570,7 +570,7 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
     def get_model_version(self):
         return get_version_by_config(self.get_hf_config())
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         mm_limits = {"image": None}
         if self.get_model_version() in {(2, 6), (4, 0), (4, 5)}:
             mm_limits["video"] = None
@@ -582,7 +582,7 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
         image_size: ImageSize,
         # For MiniCPM V/O 2.6
         image_idx: int = 0,
-        max_slice_nums: Optional[int] = None,
+        max_slice_nums: int | None = None,
         use_image_id: bool = True,
     ) -> str:
         image_processor = self.get_image_processor()
@@ -602,8 +602,8 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
         self,
         image_size: ImageSize,
         # For MiniCPM V/O 2.6
-        max_slice_nums: Optional[int] = None,
-    ) -> Optional[tuple[int, int]]:
+        max_slice_nums: int | None = None,
+    ) -> tuple[int, int] | None:
         image_processor = self.get_image_processor()
         version = self.get_model_version()
 
@@ -621,7 +621,7 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo):
     def get_num_image_tokens(
         self,
         image_size: ImageSize,
-        max_slice_nums: Optional[int] = None,
+        max_slice_nums: int | None = None,
     ) -> int:
         image_processor = self.get_image_processor()
 
@@ -712,7 +712,7 @@ class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
@@ -1024,7 +1024,7 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
     supports_encoder_tp_data = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "(<image>./</image>)"
         if modality.startswith("video"):
@@ -1073,7 +1073,7 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
         self,
         modality: str,
         **kwargs: object,
-    ) -> Optional[MiniCPMVImageInputs]:
+    ) -> MiniCPMVImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
 
@@ -1158,7 +1158,7 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
     def _process_vision_input(
         self,
         image_input: MiniCPMVImageInputs,
-    ) -> Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor, ...]]:
+    ) -> torch.Tensor | list[torch.Tensor] | tuple[torch.Tensor, ...]:
         if image_input["type"] == "image_embeds":
             return image_input["image_embeds"]
 
@@ -1200,8 +1200,8 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: Any,
     ) -> torch.Tensor:
         if intermediate_tensors is not None:
@@ -1218,7 +1218,7 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.llm.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
@@ -1243,7 +1243,7 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
     def init_vision_module(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         prefix: str = "",
     ) -> nn.Module:
         raise NotImplementedError
@@ -1252,7 +1252,7 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
         self,
         embed_dim: int,
         vision_dim: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> nn.Module:
         raise NotImplementedError
@@ -1278,7 +1278,7 @@ class MiniCPMV2_0(MiniCPMVBaseModel):
     def init_vision_module(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         prefix: str = "",
     ) -> nn.Module:
         # TODO: refactor vision model through timm wrapper from transformers
@@ -1313,7 +1313,7 @@ class MiniCPMV2_0(MiniCPMVBaseModel):
         self,
         embed_dim: int,
         vision_dim: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> nn.Module:
         with set_default_torch_dtype(torch.float16):
@@ -1381,7 +1381,7 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA):
     def init_vision_module(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         prefix: str = "",
     ) -> nn.Module:
         model = Idefics2VisionTransformer(
@@ -1398,7 +1398,7 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA):
         self,
         embed_dim: int,
         vision_dim: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> nn.Module:
         with set_default_torch_dtype(torch.float16):
@@ -1474,7 +1474,7 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA):
     def init_vision_module(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> nn.Module:
         model = Idefics2VisionTransformer(
@@ -1491,7 +1491,7 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA):
         self,
         embed_dim: int,
         vision_dim: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> nn.Module:
         with set_default_torch_dtype(torch.float16):
@@ -1577,7 +1577,7 @@ class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA):
     def init_vision_module(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> nn.Module:
         quant_config = self._maybe_ignore_quant_config(quant_config)
@@ -1595,7 +1595,7 @@ class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA):
         self,
         embed_dim: int,
         vision_dim: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> nn.Module:
         quant_config = self._maybe_ignore_quant_config(quant_config)
@@ -1682,7 +1682,7 @@ class MiniCPMV4_5(MiniCPMVBaseModel, SupportsLoRA):
     def init_vision_module(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> nn.Module:
         quant_config = self._maybe_ignore_quant_config(quant_config)
@@ -1700,7 +1700,7 @@ class MiniCPMV4_5(MiniCPMVBaseModel, SupportsLoRA):
         self,
         embed_dim: int,
         vision_dim: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> nn.Module:
         quant_config = self._maybe_ignore_quant_config(quant_config)
diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py
index e6e0952f71dd6..82f7cd3aa8c22 100644
--- a/vllm/model_executor/models/minimax_text_01.py
+++ b/vllm/model_executor/models/minimax_text_01.py
@@ -4,7 +4,7 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
     pass
@@ -83,7 +83,7 @@ class MiniMaxText01MLP(nn.Module):
         self,
         hidden_size: int,
         intermediate_size: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         layer_idx: int = None,
         prefix: str = "mlp",
     ) -> None:
@@ -121,9 +121,9 @@ class MiniMaxText01MoE(nn.Module):
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
-        params_dtype: Optional[torch.dtype] = None,
+        params_dtype: torch.dtype | None = None,
         layer_idx: int = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "moe",
     ) -> None:
         super().__init__()
@@ -191,10 +191,10 @@ class MiniMaxText01Attention(nn.Module):
         rotary_dim: int,
         max_position: int = 4096 * 32,
         rope_theta: float = 10000,
-        sliding_window: Optional[int] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        sliding_window: int | None = None,
+        quant_config: QuantizationConfig | None = None,
         layer_idx: int = None,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "mha",
     ) -> None:
         super().__init__()
@@ -273,12 +273,12 @@ class MiniMaxText01DecoderLayer(nn.Module):
     def __init__(
         self,
         config: MiniMaxConfig,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         expert_num: int = 1,
         layer_id: int = None,
-        linear_layer_id: Optional[int] = None,
+        linear_layer_id: int | None = None,
         prefix: str = "decoder",
     ) -> None:
         self._ilayer = layer_id
@@ -428,7 +428,7 @@ class MiniMaxText01DecoderLayer(nn.Module):
         hidden_states: torch.Tensor,
         positions: torch.Tensor,
         attn_metadata: AttentionMetadata,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         is_warmup: bool = False,
         **kwargs,
     ) -> tuple[torch.Tensor, torch.Tensor]:
@@ -627,12 +627,12 @@ class MiniMaxText01Model(nn.Module):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         forward_context = get_forward_context()
         attn_metadata = forward_context.attn_metadata
 
@@ -722,8 +722,8 @@ class MiniMaxText01ForCausalLM(nn.Module, HasInnerState, IsHybrid):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
     ) -> torch.Tensor:
         hidden_states = self.model(
diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py
index a25a7097a6ece..fb7c6d42a0658 100644
--- a/vllm/model_executor/models/minimax_vl_01.py
+++ b/vllm/model_executor/models/minimax_vl_01.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Mapping
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal, TypeAlias
 
 import torch
 import torch.nn as nn
@@ -52,11 +52,11 @@ class MiniMaxVL01ImagePixelInputs(TensorSchema):
 
     type: Literal["pixel_values"] = "pixel_values"
     pixel_values: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "np", 3, "h", "w", dynamic_dims={"np", "h", "w"}),
     ]
 
-    image_sizes: Annotated[Optional[torch.Tensor], TensorShape("bn", 2)]
+    image_sizes: Annotated[torch.Tensor | None, TensorShape("bn", 2)]
     # This should be in `(height, width)` format.
 
 
@@ -72,9 +72,9 @@ class MiniMaxVL01ImageEmbeddingInputs(TensorSchema):
     data: Annotated[torch.Tensor, TensorShape("bn", "ifs", "hs")]
 
 
-MiniMaxVL01ImageInputs = Union[
-    MiniMaxVL01ImagePixelInputs, MiniMaxVL01ImageEmbeddingInputs
-]
+MiniMaxVL01ImageInputs: TypeAlias = (
+    MiniMaxVL01ImagePixelInputs | MiniMaxVL01ImageEmbeddingInputs
+)
 
 
 class MiniMaxVL01MultiModalProjector(nn.Module):
@@ -84,7 +84,7 @@ class MiniMaxVL01MultiModalProjector(nn.Module):
         text_hidden_size: int,
         projector_hidden_act: str,
         multimodal_projector_bias: bool,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -127,7 +127,7 @@ class MiniMaxVL01ProcessingInfo(LlavaNextProcessingInfo):
 
         return hf_processor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
 
@@ -187,7 +187,7 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, Support
     }
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
 
@@ -239,9 +239,9 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, Support
 
     def _image_pixels_to_features(
         self,
-        vision_tower: Union[CLIPVisionModel, SiglipVisionModel, PixtralHFVisionModel],
-        pixel_values: Union[torch.Tensor, list[torch.Tensor]],
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+        vision_tower: CLIPVisionModel | SiglipVisionModel | PixtralHFVisionModel,
+        pixel_values: torch.Tensor | list[torch.Tensor],
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         # NOTE: we skip the step to select the vision feature layer since
         # this is already done inside the vision tower
         feature_select_strategy = self.config.vision_feature_select_strategy
@@ -302,7 +302,7 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, Support
     def _process_image_pixels(
         self,
         inputs: MiniMaxVL01ImagePixelInputs,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         assert self.vision_tower is not None
 
         pixel_values = inputs["pixel_values"]
@@ -311,7 +311,7 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, Support
     def _process_image_input(
         self,
         image_input: MiniMaxVL01ImageInputs,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         if image_input["type"] == "image_embeds":
             return image_input["data"]
 
@@ -330,7 +330,7 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, Support
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[MiniMaxVL01ImageInputs]:
+    ) -> MiniMaxVL01ImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_sizes = kwargs.pop("image_sizes", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -364,10 +364,10 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, Support
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
         elif inputs_embeds is None:
@@ -388,7 +388,7 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, Support
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py
index 8e74425c5dbdd..26d4deca2e120 100644
--- a/vllm/model_executor/models/mistral3.py
+++ b/vllm/model_executor/models/mistral3.py
@@ -3,7 +3,7 @@
 
 from abc import abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Final, Literal, Optional, Protocol, TypeVar, Union
+from typing import Annotated, Final, Literal, Protocol, TypeVar
 
 import torch
 import torch.nn as nn
@@ -72,7 +72,7 @@ class Mistral3ImagePixelInputs(TensorSchema):
     # Note that `height` or `width` may be different per batch and image,
     # in which case the data is passed as a list instead of a batched tensor.
     pixel_values: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", 3, "h", "w", dynamic_dims={"h", "w"}),
     ]
 
@@ -136,7 +136,7 @@ class Mistral3MultiModalProjector(nn.Module):
         patch_size: int,
         projector_hidden_act: str,
         multimodal_projector_bias: bool,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -179,7 +179,7 @@ class LlavaLikeConfig(Protocol):
     vision_config: Final[PretrainedConfig]
     image_token_index: Final[int]
     vision_feature_select_strategy: Final[str]
-    vision_feature_layer: Final[Union[int, list[int]]]
+    vision_feature_layer: Final[int | list[int]]
 
 
 class LlavaLikeProcessor(Protocol):
@@ -197,7 +197,7 @@ class BaseLlavaProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self, **kwargs: object) -> LlavaLikeProcessor:
         raise NotImplementedError
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_num_image_tokens(
@@ -234,7 +234,7 @@ class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[_I]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
@@ -348,7 +348,7 @@ def _build_mistral3_processor(
     info: _I,
     dummy_inputs: BaseDummyInputsBuilder[_I],
     *,
-    cache: Optional[BaseMultiModalProcessorCache] = None,
+    cache: BaseMultiModalProcessorCache | None = None,
 ) -> BaseMultiModalProcessor:
     assert isinstance(info, Mistral3ProcessingInfo)
     return Mistral3MultiModalProcessor(
@@ -394,9 +394,9 @@ def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
 
 def init_vision_tower_for_llava(
     hf_config: LlavaLikeConfig,
-    quant_config: Optional[QuantizationConfig],
+    quant_config: QuantizationConfig | None,
     *,
-    require_post_norm: Optional[bool] = None,
+    require_post_norm: bool | None = None,
     prefix: str = "",
 ) -> PixtralHFVisionModel:
     vision_config = hf_config.vision_config
@@ -441,7 +441,7 @@ class Mistral3ForConditionalGeneration(
     )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return None
 
@@ -504,7 +504,7 @@ class Mistral3ForConditionalGeneration(
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[Mistral3ImagePixelInputs]:
+    ) -> Mistral3ImagePixelInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
 
@@ -519,7 +519,7 @@ class Mistral3ForConditionalGeneration(
     def _process_image_input(
         self,
         image_input: Mistral3ImagePixelInputs,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         if image_input["type"] == "image_embeds":
             return image_input["data"]
 
@@ -562,10 +562,10 @@ class Mistral3ForConditionalGeneration(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for Mistral3.
 
         One key thing to understand is the `input_ids` already accounts for the
@@ -615,7 +615,7 @@ class Mistral3ForConditionalGeneration(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 37b49349ec12c..bc56481820a92 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -27,7 +27,6 @@
 import typing
 from collections.abc import Callable, Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -88,10 +87,10 @@ class MixtralMoE(nn.Module):
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        tp_size: Optional[int] = None,
-        dp_size: Optional[int] = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
+        tp_size: int | None = None,
+        dp_size: int | None = None,
         prefix: str = "",
         enable_eplb: bool = False,
     ):
@@ -163,8 +162,8 @@ class MixtralAttention(nn.Module):
         num_kv_heads: int,
         max_position: int = 4096 * 32,
         rope_theta: float = 10000,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -242,8 +241,8 @@ class MixtralDecoderLayer(nn.Module):
     def __init__(
         self,
         config: MixtralConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         enable_eplb: bool = False,
     ) -> None:
@@ -280,7 +279,7 @@ class MixtralDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
         # Self Attention
         if residual is None:
@@ -353,9 +352,9 @@ class MixtralModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -615,9 +614,9 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP, MixtureOfExperts):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -626,7 +625,7 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP, MixtureOfExperts):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py
index b624a6200ab3d..8da45f681043a 100644
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -19,7 +19,7 @@
 import math
 from collections.abc import Iterable, Mapping
 from itertools import tee
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal
 
 import torch
 from torch import nn
@@ -115,7 +115,7 @@ class Llama4VisionMLP(nn.Module):
         output_size: int,
         bias: bool,
         output_activation: bool,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -152,7 +152,7 @@ class Llama4MultiModalProjector(nn.Module):
     def __init__(
         self,
         config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -199,7 +199,7 @@ class Llama4VisionPixelShuffleMLP(nn.Module):
     def __init__(
         self,
         config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -229,7 +229,7 @@ class Llama4VisionAttention(nn.Module):
     def __init__(
         self,
         config: Llama4VisionConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -323,7 +323,7 @@ class Llama4VisionEncoderLayer(nn.Module):
     def __init__(
         self,
         config: Llama4VisionConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -376,7 +376,7 @@ class Llama4VisionEncoder(nn.Module):
     def __init__(
         self,
         config: Llama4VisionConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -419,7 +419,7 @@ class Llama4UnfoldConvolution(nn.Module):
     def __init__(
         self,
         config: Llama4VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -449,7 +449,7 @@ class Llama4VisionModel(nn.Module):
     def __init__(
         self,
         config: Llama4VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -547,7 +547,7 @@ class Mllama4ProcessingInfo(BaseProcessingInfo):
             Llama4Processor, use_fast=kwargs.pop("use_fast", True), **kwargs
         )
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         # Although vLLM can support more images from an infra capability
         # perspective, we do not recommend using >10 images in practice.
         return {"image": None}
@@ -699,7 +699,7 @@ class Mllama4DummyInputsBuilder(BaseDummyInputsBuilder[Mllama4ProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
@@ -733,7 +733,7 @@ class Llama4ForConditionalGeneration(
     supports_encoder_tp_data = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|image|>"
 
@@ -792,7 +792,7 @@ class Llama4ForConditionalGeneration(
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[Llama4ImagePatchInputs]:
+    ) -> Llama4ImagePatchInputs | None:
         # num_images, 1, num_chunks, channel, image_size, image_size
         pixel_values = kwargs.pop("pixel_values", None)
         if pixel_values is None:
@@ -849,10 +849,10 @@ class Llama4ForConditionalGeneration(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
@@ -863,7 +863,7 @@ class Llama4ForConditionalGeneration(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def separate_weights(
diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py
index 58e2acb8ce922..ff9f6a41ab994 100644
--- a/vllm/model_executor/models/modernbert.py
+++ b/vllm/model_executor/models/modernbert.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Set
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -50,7 +49,7 @@ class ModernBertEmbeddings(nn.Module):
     def forward(
         self,
         input_ids: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if inputs_embeds is not None:
             return self.norm(inputs_embeds)
@@ -74,7 +73,7 @@ class ModernBertRotaryEmbedding(RotaryEmbedding):
 
 
 class ModernBertAttention(nn.Module):
-    def __init__(self, config: ModernBertConfig, layer_id: Optional[int] = None):
+    def __init__(self, config: ModernBertConfig, layer_id: int | None = None):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
@@ -151,7 +150,7 @@ class ModernBertMLP(nn.Module):
 
 class ModernBertLayer(nn.Module):
     def __init__(
-        self, config: ModernBertConfig, prefix: str = "", layer_id: Optional[int] = None
+        self, config: ModernBertConfig, prefix: str = "", layer_id: int | None = None
     ):
         super().__init__()
         self.config = config
@@ -243,8 +242,8 @@ class ModernBertModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if inputs_embeds is not None:
             hidden_states = inputs_embeds
@@ -287,9 +286,9 @@ class ModernBertPooler(Pooler):
 
     def forward(
         self,
-        hidden_states: Union[torch.Tensor, list[torch.Tensor]],
+        hidden_states: torch.Tensor | list[torch.Tensor],
         pooling_metadata: PoolingMetadata,
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+    ) -> torch.Tensor | list[torch.Tensor]:
         pooled_output = self.pooling(hidden_states, pooling_metadata)
 
         if isinstance(pooled_output, list):
@@ -370,10 +369,10 @@ class ModernBertForSequenceClassification(nn.Module, SupportsCrossEncoding):
 
     def forward(
         self,
-        input_ids: Optional[torch.LongTensor],
+        input_ids: torch.LongTensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         return self.model(
             input_ids=input_ids,
@@ -436,10 +435,10 @@ class ModernBertForTokenClassification(nn.Module):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         hidden_states = self.model(
             input_ids=input_ids,
diff --git a/vllm/model_executor/models/module_mapping.py b/vllm/model_executor/models/module_mapping.py
index 666796d835a36..9e7d997bdb01d 100644
--- a/vllm/model_executor/models/module_mapping.py
+++ b/vllm/model_executor/models/module_mapping.py
@@ -5,7 +5,6 @@
 #  https://github.com/modelscope/ms-swift/blob/v2.4.2/swift/utils/module_mapping.py
 
 from dataclasses import dataclass, field
-from typing import Union
 
 
 @dataclass
@@ -55,10 +54,10 @@ class MultiModelKeys(ModelKeys):
 
     @staticmethod
     def from_string_field(
-        language_model: Union[str, list[str]] = None,
-        connector: Union[str, list[str]] = None,
-        tower_model: Union[str, list[str]] = None,
-        generator: Union[str, list[str]] = None,
+        language_model: str | list[str] = None,
+        connector: str | list[str] = None,
+        tower_model: str | list[str] = None,
+        generator: str | list[str] = None,
         **kwargs,
     ) -> "MultiModelKeys":
         def to_list(value):
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index f1dd06f3a0650..83e0f282ddf8d 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -6,7 +6,7 @@ from collections.abc import Iterable, Mapping, Sequence
 from dataclasses import dataclass
 from functools import cached_property, partial
 from itertools import islice
-from typing import Annotated, Optional, Union
+from typing import Annotated
 
 import numpy as np
 import torch
@@ -104,18 +104,18 @@ class MolmoImageInputs(TensorSchema):
     """
 
     images: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "nc", "np", "pd", dynamic_dims={"nc"}),
     ]
     # Number of crops may vary per batch and image, so pass it as a list.
 
     image_masks: Annotated[
-        Optional[Union[torch.Tensor, list[torch.Tensor]]],
+        torch.Tensor | list[torch.Tensor] | None,
         TensorShape("bn", "nc", "np", dynamic_dims={"nc"}),
     ]
 
     image_input_idx: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "nc", "tp", dynamic_dims={"nc"}),
     ]
     # An index tensor that maps image features to their corresponding patch tokens.
@@ -151,7 +151,7 @@ class ViTMLP(nn.Module):
     def __init__(
         self,
         config: VisionBackboneConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
         self.w1 = ColumnParallelLinear(
@@ -185,7 +185,7 @@ class MultiHeadDotProductAttention(nn.Module):
         config: VisionBackboneConfig,
         use_bias: bool = True,
         nlayers: int = 1,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
 
@@ -238,7 +238,7 @@ class MultiHeadDotProductAttention(nn.Module):
         )
 
     def forward(
-        self, inputs_q: torch.Tensor, inputs_kv: Optional[torch.Tensor] = None
+        self, inputs_q: torch.Tensor, inputs_kv: torch.Tensor | None = None
     ) -> torch.Tensor:
         if inputs_kv is not None:
             inputs_k = inputs_kv
@@ -263,7 +263,7 @@ class ResidualAttentionBlock(nn.Module):
     def __init__(
         self,
         config: VisionBackboneConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
         self.attention = MultiHeadDotProductAttention(config, quant_config=quant_config)
@@ -289,7 +289,7 @@ class BlockCollection(nn.Module):
     def __init__(
         self,
         config: VisionBackboneConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
         self.resblocks = nn.ModuleList(
@@ -317,7 +317,7 @@ class VisionTransformer(nn.Module):
     def __init__(
         self,
         config: VisionBackboneConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
         scale = config.image_emb_dim**-0.5
@@ -367,7 +367,7 @@ class VisionTransformer(nn.Module):
         return x
 
     def forward(
-        self, x: torch.Tensor, patch_num: Optional[int] = None
+        self, x: torch.Tensor, patch_num: int | None = None
     ) -> list[torch.Tensor]:
         """
         : param x: (batch_size, num_patch, n_pixels)
@@ -396,8 +396,8 @@ class MolmoAttention(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -432,9 +432,9 @@ class MolmoAttention(nn.Module):
             quant_config=quant_config,
         )
 
-        self.tp_rank: Optional[int] = None
-        self.k_norm: Optional[nn.Module] = None
-        self.q_norm: Optional[nn.Module] = None
+        self.tp_rank: int | None = None
+        self.k_norm: nn.Module | None = None
+        self.q_norm: nn.Module | None = None
         if config.attention_layer_norm:
             self.tp_rank = get_tensor_model_parallel_rank()
             self.k_norm = RMSNorm(
@@ -503,8 +503,8 @@ class LanguageModelMLP(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        input_dim: Optional[int] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        input_dim: int | None = None,
+        quant_config: QuantizationConfig | None = None,
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
@@ -542,8 +542,8 @@ class ImageProjectorMLP(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        input_dim: Optional[int] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        input_dim: int | None = None,
+        quant_config: QuantizationConfig | None = None,
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
@@ -580,8 +580,8 @@ class MolmoDecoderLayer(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -604,8 +604,8 @@ class MolmoDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
-    ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]:
+        residual: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor] | None]:
         # Self Attention
         if residual is None:
             residual = hidden_states
@@ -627,8 +627,8 @@ class MolmoDecoderNormAfterLayer(MolmoDecoderLayer):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
-    ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]:
+        residual: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor] | None]:
         # Self Attention
         residual = hidden_states
         hidden_states = self.self_attn(
@@ -654,7 +654,7 @@ class MolmoVisionBackbone(nn.Module, SupportsQuant):
         self,
         config: PretrainedConfig,
         vision_config: VisionBackboneConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ) -> None:
         super().__init__()
         self.vit_layers = VIT_LAYERS
@@ -849,8 +849,8 @@ class MolmoModel(nn.Module, SupportsQuant):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
@@ -1064,7 +1064,7 @@ class MolmoProcessorWrapper:
         return image_token_length_h
 
     @property
-    def message_format(self) -> Optional[str]:
+    def message_format(self) -> str | None:
         return "role"
 
     @property
@@ -1145,9 +1145,9 @@ class MolmoProcessorWrapper:
 
     def __call__(
         self,
-        text: Optional[Union[TextInput, list[TextInput]]] = None,
-        images: Optional[Union[ImageInput, list[ImageInput]]] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        text: TextInput | list[TextInput] | None = None,
+        images: ImageInput | list[ImageInput] | None = None,
+        return_tensors: str | TensorType | None = None,
         **kwargs,
     ) -> BatchFeature:
         outputs = self.processor.process(  # type: ignore
@@ -1189,7 +1189,7 @@ class MolmoProcessingInfo(BaseProcessingInfo):
         processor = self.ctx.get_hf_processor(**kwargs)
         return MolmoProcessorWrapper(processor)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_num_image_tokens(
@@ -1197,7 +1197,7 @@ class MolmoProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[MolmoProcessorWrapper],
+        processor: MolmoProcessorWrapper | None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor()
@@ -1250,7 +1250,7 @@ class MolmoDummyInputsBuilder(BaseDummyInputsBuilder[MolmoProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         target_width, target_height = self.info.get_image_size_with_most_features()
         num_images = mm_counts.get("image", 0)
@@ -1398,7 +1398,7 @@ class MolmoForCausalLM(
     }
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return None
 
@@ -1442,7 +1442,7 @@ class MolmoForCausalLM(
     def _parse_and_validate_image_input(
         self,
         **kwargs: object,
-    ) -> Optional[MolmoImageInputs]:
+    ) -> MolmoImageInputs | None:
         images = kwargs.pop("images", None)
         image_masks = kwargs.pop("image_masks", None)
         image_input_idx = kwargs.pop("image_input_idx", None)
@@ -1522,8 +1522,8 @@ class MolmoForCausalLM(
         self,
         input_ids: torch.LongTensor,
         positions: torch.LongTensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> torch.Tensor:
         if intermediate_tensors is not None:
diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py
index 3bf8fce0de0d4..96ec6e6b56acb 100644
--- a/vllm/model_executor/models/moonvit.py
+++ b/vllm/model_executor/models/moonvit.py
@@ -45,7 +45,6 @@
 from collections.abc import Sequence
 from copy import deepcopy
 from functools import cached_property
-from typing import Optional, Union
 
 import torch
 import torch.nn as nn
@@ -68,8 +67,8 @@ def multihead_attention(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,
-    q_cu_seqlens: Optional[torch.Tensor] = None,
-    k_cu_seqlens: Optional[torch.Tensor] = None,
+    q_cu_seqlens: torch.Tensor | None = None,
+    k_cu_seqlens: torch.Tensor | None = None,
 ) -> torch.Tensor:
     """Multi-head attention using flash attention 2.
 
@@ -121,8 +120,8 @@ def sdpa_attention(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,
-    q_cu_seqlens: Optional[torch.Tensor] = None,
-    k_cu_seqlens: Optional[torch.Tensor] = None,
+    q_cu_seqlens: torch.Tensor | None = None,
+    k_cu_seqlens: torch.Tensor | None = None,
 ) -> torch.Tensor:
     """SDPA attention.
 
@@ -230,7 +229,7 @@ class MoonVisionPatchEmbed(nn.Module):
         self,
         out_dim: int,
         in_dim: int = 3,
-        patch_size: Union[int, tuple[int, int]] = (14, 14),
+        patch_size: int | tuple[int, int] = (14, 14),
         pos_emb_height: int = 14,
         pos_emb_width: int = 14,
     ):
@@ -460,7 +459,7 @@ class MoonVitEncoderLayer(nn.Module):
         self,
         x: torch.Tensor,
         cu_seqlens: torch.Tensor,
-        rope_freqs_cis: Optional[torch.Tensor] = None,
+        rope_freqs_cis: torch.Tensor | None = None,
     ):
         """
         Args:
@@ -491,7 +490,7 @@ class MoonVitEncoderLayer(nn.Module):
         self,
         hidden_states: torch.Tensor,
         cu_seqlens: torch.Tensor,
-        rope_freqs_cis: Union[torch.Tensor, None] = None,
+        rope_freqs_cis: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """
         Args:
diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py
index 3f1f2bbcb0267..936dbf6c3243e 100644
--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@@ -5,7 +5,6 @@
 import math
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 import torch.nn as nn
@@ -58,8 +57,8 @@ class MPTAttention(nn.Module):
     def __init__(
         self,
         config: MptConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -152,7 +151,7 @@ class MPTMLP(nn.Module):
     def __init__(
         self,
         config: MptConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
         hidden_size = config.d_model
@@ -183,8 +182,8 @@ class MPTBlock(nn.Module):
     def __init__(
         self,
         config: MptConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -251,9 +250,9 @@ class MPTModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -311,9 +310,9 @@ class MPTForCausalLM(nn.Module, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.transformer(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -322,7 +321,7 @@ class MPTForCausalLM(nn.Module, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 91dfa67355341..56d3c4bb7d107 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -11,7 +11,7 @@ import copy
 import warnings
 from abc import ABC, abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Any, Literal, Optional, TypedDict, TypeVar, Union
+from typing import Annotated, Any, Literal, TypeAlias, TypedDict, TypeVar
 
 import numpy.typing as npt
 import torch
@@ -110,7 +110,7 @@ class NanoNemotronVLImagePixelInputs(TypedDict):
 
 class NanoNemotronVLImageEmbeddinInputs(TypedDict):
     type: Literal["image_embeds"]
-    data: Union[torch.Tensor, list[torch.Tensor]]
+    data: torch.Tensor | list[torch.Tensor]
     """ 
     A tensor of shape `(num_images, total_image_feature_size, hidden_size)`
     or a list of tensors of shape `(total_image_feature_size, hidden_size)`
@@ -119,9 +119,9 @@ class NanoNemotronVLImageEmbeddinInputs(TypedDict):
     """
 
 
-NanoNemotronVLImageInputs = Union[
-    NanoNemotronVLImagePixelInputs, NanoNemotronVLImageEmbeddinInputs
-]
+NanoNemotronVLImageInputs: TypeAlias = (
+    NanoNemotronVLImagePixelInputs | NanoNemotronVLImageEmbeddinInputs
+)
 
 
 class NanoNemotronVLVideoPixelInputs(TensorSchema):
@@ -148,12 +148,12 @@ class NanoNemotronVLVideoEmbeddingInputs(TensorSchema):
     """
 
     type: Literal["video_embeds"]
-    data: Annotated[Union[torch.Tensor, list[torch.Tensor]], TensorShape("n", "f", "h")]
+    data: Annotated[torch.Tensor | list[torch.Tensor], TensorShape("n", "f", "h")]
 
 
-NanoNemotronVLVideoInputs = Union[
-    NanoNemotronVLVideoPixelInputs, NanoNemotronVLVideoEmbeddingInputs
-]
+NanoNemotronVLVideoInputs: TypeAlias = (
+    NanoNemotronVLVideoPixelInputs | NanoNemotronVLVideoEmbeddingInputs
+)
 
 
 def dynamic_preprocess(
@@ -262,7 +262,7 @@ class BaseNanoNemotronVLProcessor(ABC):
         config: PretrainedConfig,
         tokenizer: AnyTokenizer,
         *args,
-        max_num_tiles: Optional[int] = None,
+        max_num_tiles: int | None = None,
         **kwargs,
     ) -> None:
         super().__init__()
@@ -291,7 +291,7 @@ class BaseNanoNemotronVLProcessor(ABC):
     def get_image_repl(
         self,
         feature_size: int,
-        num_patches: Optional[int],
+        num_patches: int | None,
     ) -> PromptUpdateDetails[str]:
         raise NotImplementedError
 
@@ -354,7 +354,7 @@ class BaseNanoNemotronVLProcessor(ABC):
                 text = [t.replace("<image>", image_repl.full, 1) for t in text]
         return text, image_inputs
 
-    def _make_batch_input(self, input_item: Optional[Union[Any, list[Any]]] = None):
+    def _make_batch_input(self, input_item: Any | list[Any] | None = None):
         if input_item is None:
             input_item = []
         if not isinstance(input_item, list):
@@ -363,10 +363,10 @@ class BaseNanoNemotronVLProcessor(ABC):
 
     def __call__(
         self,
-        text: Optional[Union[str, list[str]]] = None,
-        images: Optional[Union[Image.Image, list[Image.Image]]] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        max_num_tiles: Optional[int] = None,
+        text: str | list[str] | None = None,
+        images: Image.Image | list[Image.Image] | None = None,
+        return_tensors: str | TensorType | None = None,
+        max_num_tiles: int | None = None,
     ) -> BatchFeature:
         # Use default if not provided
         if max_num_tiles is None:
@@ -399,12 +399,12 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
         config: PretrainedConfig,
         tokenizer: AnyTokenizer,
         *,
-        max_num_tiles: Optional[int] = None,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        video_token: Optional[str] = None,
-        video_pruning_rate: Optional[float] = None,
+        max_num_tiles: int | None = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        video_token: str | None = None,
+        video_pruning_rate: float | None = None,
     ) -> None:
         super().__init__(
             config=config,
@@ -423,7 +423,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
         return self.video_token_id is not None
 
     @property
-    def video_token_id(self) -> Optional[int]:
+    def video_token_id(self) -> int | None:
         if self.video_token is None:
             return None
         return self.tokenizer.get_vocab().get(self.video_token, None)
@@ -436,7 +436,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
         self,
         videos: list[npt.NDArray],
         max_num_tiles: int,
-        dynamic_image_size: Optional[bool] = None,
+        dynamic_image_size: bool | None = None,
     ) -> list[torch.Tensor]:
         return [
             video_to_pixel_values(
@@ -453,7 +453,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
         text: list[str],
         videos: list[npt.NDArray],
         max_num_tiles: int,
-        dynamic_image_size: Optional[bool] = None,
+        dynamic_image_size: bool | None = None,
     ):
         if len(videos) == 0 or not self.supports_video:
             video_inputs = {}
@@ -508,12 +508,12 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
 
     def __call__(
         self,
-        text: Optional[Union[str, list[str]]] = None,
-        images: Optional[Union[Image.Image, list[Image.Image]]] = None,
-        videos: Optional[Union[npt.NDArray, list[npt.NDArray]]] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        max_num_tiles: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
+        text: str | list[str] | None = None,
+        images: Image.Image | list[Image.Image] | None = None,
+        videos: npt.NDArray | list[npt.NDArray] | None = None,
+        return_tensors: str | TensorType | None = None,
+        max_num_tiles: int | None = None,
+        dynamic_image_size: bool | None = None,
     ) -> BatchFeature:
         # Use default if not provided
         if max_num_tiles is None:
@@ -545,7 +545,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
     def get_image_repl(
         self,
         feature_size: int,
-        num_patches: Optional[int],
+        num_patches: int | None,
     ) -> PromptUpdateDetails[str]:
         repl_features = IMG_CONTEXT * feature_size
         repl_full = IMG_START + repl_features + IMG_END
@@ -598,7 +598,7 @@ class BaseNanoNemotronVLProcessingInfo(BaseProcessingInfo):
     ) -> BaseNanoNemotronVLProcessor:
         raise NotImplementedError
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_num_image_tokens(
@@ -607,7 +607,7 @@ class BaseNanoNemotronVLProcessingInfo(BaseProcessingInfo):
         image_width: int,
         image_height: int,
         max_num_tiles: int,
-        processor: Optional[BaseNanoNemotronVLProcessor],
+        processor: BaseNanoNemotronVLProcessor | None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor()
@@ -673,10 +673,10 @@ class NanoNemotronVLProcessingInfo(BaseNanoNemotronVLProcessingInfo):
         video_limit = {"video": None} if self.supports_video else {}
         return {**super().get_supported_mm_limits(), **video_limit}
 
-    def get_video_token(self) -> Optional[str]:
+    def get_video_token(self) -> str | None:
         return IMG_CONTEXT
 
-    def get_video_pruning_rate(self) -> Optional[float]:
+    def get_video_pruning_rate(self) -> float | None:
         return self.ctx.get_mm_config().video_pruning_rate
 
     def get_num_frames_with_most_features(
@@ -929,7 +929,7 @@ class NanoNemotronVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         # Use default max_num_tiles for dummy data generation
         max_num_tiles = 12
@@ -964,7 +964,7 @@ class NanoNemotronVLDummyInputsBuilder(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         dummy_image = super().get_dummy_mm_data(
             seq_len=seq_len, mm_counts=mm_counts, mm_options=mm_options
@@ -1000,7 +1000,7 @@ class NemotronH_Nano_VL_V2(
     nn.Module, HasInnerState, IsHybrid, SupportsMultiModal, SupportsMultiModalPruning
 ):
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
         if modality.startswith("video"):
@@ -1097,7 +1097,7 @@ class NemotronH_Nano_VL_V2(
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[NanoNemotronVLImageInputs]:
+    ) -> NanoNemotronVLImageInputs | None:
         pixel_values_flat = kwargs.pop("pixel_values_flat", None)
         image_num_patches = kwargs.pop("image_num_patches", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -1274,7 +1274,7 @@ class NemotronH_Nano_VL_V2(
 
     def _parse_and_validate_video_input(
         self, **kwargs: object
-    ) -> Optional[NanoNemotronVLVideoPixelInputs]:
+    ) -> NanoNemotronVLVideoPixelInputs | None:
         pixel_values_flat_video = kwargs.pop("pixel_values_flat_video", None)
         video_num_patches = kwargs.pop("video_num_patches", None)
         video_embeds = kwargs.pop("video_embeds", None)
@@ -1365,10 +1365,10 @@ class NemotronH_Nano_VL_V2(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             input_ids = None
             inputs_embeds = None
@@ -1396,7 +1396,7 @@ class NemotronH_Nano_VL_V2(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index 8f07a2cf12f7a..845798b18d1b3 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -26,7 +26,7 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -85,7 +85,7 @@ def _cast_if_autocast_enabled(*args):
 class NemotronLayerNorm1P(nn.LayerNorm):
     def __init__(
         self,
-        normalized_shape: Union[int, list[int], torch.Size],
+        normalized_shape: int | list[int] | torch.Size,
         eps: float = 1e-5,
         elementwise_affine: bool = True,
         bias: bool = True,
@@ -97,7 +97,7 @@ class NemotronLayerNorm1P(nn.LayerNorm):
     def forward(
         self,
         x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
+        residual: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if residual is not None:
             x = x + residual
@@ -116,7 +116,7 @@ class NemotronMLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         prefix: str = "",
     ) -> None:
@@ -152,11 +152,11 @@ class NemotronAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -238,8 +238,8 @@ class NemotronDecoderLayer(nn.Module):
     def __init__(
         self,
         config: NemotronConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -292,7 +292,7 @@ class NemotronDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
@@ -363,11 +363,11 @@ class NemotronModel(nn.Module):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -514,9 +514,9 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         model_output = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -525,7 +525,7 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
index 0a05c63a31ea2..a591f0b01c4e8 100644
--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -19,7 +19,6 @@
 """Inference-only NemotronH model."""
 
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 from torch import nn
@@ -75,7 +74,7 @@ class NemotronHMLP(nn.Module):
         self,
         config: NemotronHConfig,
         layer_idx: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         prefix: str = "",
     ) -> None:
@@ -119,9 +118,9 @@ class NemotronHMLPDecoderLayer(nn.Module):
         self,
         config: NemotronHConfig,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -140,7 +139,7 @@ class NemotronHMLPDecoderLayer(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         if residual is None:
@@ -158,9 +157,9 @@ class NemotronHMambaDecoderLayer(nn.Module):
         self,
         config: NemotronHConfig,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -188,7 +187,7 @@ class NemotronHMambaDecoderLayer(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         if residual is None:
@@ -207,9 +206,9 @@ class NemotronHAttention(nn.Module):
         self,
         config: NemotronHConfig,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -279,9 +278,9 @@ class NemotronHAttentionDecoderLayer(nn.Module):
         self,
         config: NemotronHConfig,
         layer_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -301,7 +300,7 @@ class NemotronHAttentionDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         if residual is None:
@@ -377,8 +376,8 @@ class NemotronHModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
@@ -553,8 +552,8 @@ class NemotronHForCausalLM(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
     ):
         hidden_states = self.model(
@@ -566,7 +565,7 @@ class NemotronHForCausalLM(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py
index ddd623b5de237..17e009612df43 100644
--- a/vllm/model_executor/models/nemotron_nas.py
+++ b/vllm/model_executor/models/nemotron_nas.py
@@ -26,7 +26,7 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -84,12 +84,12 @@ class DeciLMAttention(LlamaAttention):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         bias_o_proj: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
         attn_type: str = AttentionType.DECODER,
     ) -> None:
@@ -112,8 +112,8 @@ class DeciLMAttention(LlamaAttention):
     def _init_rotary_emb(
         self,
         config,
-        rope_scaling: Optional[dict[str, Any]],
-        quant_config: Optional[QuantizationConfig],
+        rope_scaling: dict[str, Any] | None,
+        quant_config: QuantizationConfig | None,
     ) -> None:
         # Enables YARN for Mistral and LLaMA4 derivatives.
         is_neox_style = True
@@ -139,8 +139,8 @@ class DeciLMDecoderLayer(nn.Module):
         self,
         config: LlamaConfig,
         layer_idx: int,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -210,7 +210,7 @@ class DeciLMDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
 
@@ -303,11 +303,11 @@ class DeciModel(nn.Module):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -487,9 +487,9 @@ class DeciLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, HasNoOps):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         model_output = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -498,7 +498,7 @@ class DeciLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, HasNoOps):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py
index 268644bc92499..9e1323f41ee08 100644
--- a/vllm/model_executor/models/nemotron_vl.py
+++ b/vllm/model_executor/models/nemotron_vl.py
@@ -9,7 +9,6 @@
 # --------------------------------------------------------
 from abc import ABC
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -207,9 +206,9 @@ class NemotronVLProcessor(InternVLProcessor):
         tokenizer: AnyTokenizer,
         image_processor: BaseImageProcessorFast,
         *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
     ) -> None:
         ABC.__init__(self)
         self.config = config
@@ -266,9 +265,9 @@ class NemotronVLProcessor(InternVLProcessor):
     def _images_to_pixel_values_lst(
         self,
         images: list[Image.Image],
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
     ) -> list[torch.Tensor]:
         min_num, max_num = self.resolve_min_max_num(
             min_dynamic_patch=min_dynamic_patch,
@@ -292,9 +291,9 @@ class NemotronVLProcessor(InternVLProcessor):
         self,
         text: list[str],
         images: list[Image.Image],
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
     ) -> tuple[list[str], dict[str, torch.Tensor]]:
         if len(images) == 0:
             image_inputs = {}
@@ -326,7 +325,7 @@ class NemotronVLProcessor(InternVLProcessor):
     def get_image_repl(
         self,
         feature_size: int,
-        num_patches: Optional[int],
+        num_patches: int | None,
     ) -> PromptUpdateDetails[str]:
         repl_features = IMG_CONTEXT * feature_size
         repl_full = IMG_START + repl_features + IMG_END
@@ -362,7 +361,7 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
     merge_by_field_config = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
 
@@ -426,7 +425,7 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
     def _init_vision_model(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         *,
         prefix: str,
     ):
@@ -482,7 +481,7 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[InternVLImageInputs]:
+    ) -> InternVLImageInputs | None:
         pixel_values_flat = kwargs.pop("pixel_values_flat", None)
         image_num_patches = kwargs.pop("image_num_patches", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -581,9 +580,9 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
         *,
-        is_multimodal: Optional[torch.Tensor] = None,
+        is_multimodal: torch.Tensor | None = None,
         handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
         if multimodal_embeddings is not None and len(multimodal_embeddings) > 0:
@@ -604,8 +603,8 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> IntermediateTensors:
         if intermediate_tensors is not None:
@@ -630,7 +629,7 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py
index f17bf3b09d5be..73dd8dfd0f85d 100644
--- a/vllm/model_executor/models/nvlm_d.py
+++ b/vllm/model_executor/models/nvlm_d.py
@@ -8,7 +8,6 @@
 # Licensed under Apache 2.0 License [see LICENSE for details]
 # --------------------------------------------------------
 from collections.abc import Mapping, Sequence
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -49,7 +48,7 @@ class NVLMProcessor(BaseInternVLProcessor):
     def get_image_repl(
         self,
         feature_size: int,
-        num_patches: Optional[int],
+        num_patches: int | None,
     ) -> PromptUpdateDetails[str]:
         if num_patches is None:
             raise NotImplementedError("Embedding inputs are not supported")
@@ -93,7 +92,7 @@ class NVLMDummyInputsBuilder(BaseInternVLDummyInputsBuilder[NVLMProcessingInfo])
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         target_width, target_height = self.info.get_image_size_with_most_features()
         num_images = mm_counts.get("image", 0)
@@ -189,7 +188,7 @@ class NVLM_D_Model(InternVLChatModel):
     def _init_vision_model(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         *,
         is_mono: bool,
         prefix: str,
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py
index f334bbf9feeb5..1e1a1293136f4 100644
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -26,7 +26,6 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -72,8 +71,8 @@ class OlmoAttention(nn.Module):
     def __init__(
         self,
         config: OlmoConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -152,7 +151,7 @@ class OlmoMLP(nn.Module):
     def __init__(
         self,
         config: OlmoConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -201,8 +200,8 @@ class OlmoDecoderLayer(nn.Module):
     def __init__(
         self,
         config: OlmoConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -226,7 +225,7 @@ class OlmoDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-    ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]:
+    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor] | None]:
         # Attention block.
         residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states)
@@ -276,9 +275,9 @@ class OlmoModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         """
         :param input_ids: A tensor of shape `(batch_size, seq_len)`.
         """
@@ -389,9 +388,9 @@ class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids=input_ids,
             positions=positions,
@@ -403,7 +402,7 @@ class OlmoForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py
index 79234cc4dd8de..a0ae9923ad76e 100644
--- a/vllm/model_executor/models/olmo2.py
+++ b/vllm/model_executor/models/olmo2.py
@@ -27,7 +27,6 @@
 from collections.abc import Iterable
 from functools import partial
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -312,9 +311,9 @@ class Olmo2Model(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         """
         :param input_ids: A tensor of shape `(batch_size, seq_len)`.
         """
@@ -429,9 +428,9 @@ class Olmo2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids=input_ids,
             positions=positions,
@@ -443,7 +442,7 @@ class Olmo2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py
index 0e4b408775f5f..06307ae22c1b9 100644
--- a/vllm/model_executor/models/olmoe.py
+++ b/vllm/model_executor/models/olmoe.py
@@ -17,7 +17,6 @@
 from collections.abc import Iterable
 from functools import partial
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -77,9 +76,9 @@ class OlmoeMoE(nn.Module):
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        tp_size: Optional[int] = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
+        tp_size: int | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -245,7 +244,7 @@ class OlmoeDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
         # Self Attention
         if residual is None:
@@ -302,9 +301,9 @@ class OlmoeModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -483,9 +482,9 @@ class OlmoeForCausalLM(nn.Module, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index eadfea6084e5e..d124b7671b9cf 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -22,7 +22,6 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -77,8 +76,8 @@ class OPTAttention(nn.Module):
         embed_dim: int,
         num_heads: int,
         bias: bool = True,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -129,8 +128,8 @@ class OPTDecoderLayer(nn.Module):
     def __init__(
         self,
         config: OPTConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -202,8 +201,8 @@ class OPTDecoder(nn.Module):
     def __init__(
         self,
         config: OPTConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -270,9 +269,9 @@ class OPTDecoder(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is None:
                 inputs_embeds = self.get_input_embeddings(input_ids)
@@ -319,9 +318,9 @@ class OPTModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         return self.decoder(
             input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
         )
@@ -402,9 +401,9 @@ class OPTForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -413,7 +412,7 @@ class OPTForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py
index 0ce1729389553..cfe4d03334182 100644
--- a/vllm/model_executor/models/orion.py
+++ b/vllm/model_executor/models/orion.py
@@ -9,7 +9,7 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -51,7 +51,7 @@ class OrionMLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
@@ -80,10 +80,10 @@ class OrionAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -158,8 +158,8 @@ class OrionDecoderLayer(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -250,9 +250,9 @@ class OrionModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -340,9 +340,9 @@ class OrionForCausalLM(nn.Module, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -351,7 +351,7 @@ class OrionForCausalLM(nn.Module, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py
index 12ed7b4c2ed03..dd7cbf54857f1 100644
--- a/vllm/model_executor/models/ovis.py
+++ b/vllm/model_executor/models/ovis.py
@@ -20,7 +20,7 @@
 
 import math
 from collections.abc import Iterable, Mapping
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal
 
 import torch
 import torch.nn as nn
@@ -87,7 +87,7 @@ class VisualTokenizer(torch.nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -114,7 +114,7 @@ class VisualTokenizer(torch.nn.Module):
     def _init_backbone(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> nn.Module:
         model_type = config.backbone_config.model_type
@@ -282,7 +282,7 @@ class OvisProcessingInfo(BaseProcessingInfo):
         text_model_type = hf_text_config.model_type
         return IMAGE_PAD_TOKEN_MAP.get(text_model_type)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_image_size_with_most_features(self) -> ImageSize:
@@ -302,7 +302,7 @@ class OvisDummyInputsBuilder(BaseDummyInputsBuilder[OvisProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
@@ -417,7 +417,7 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
     merge_by_field_config = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
 
@@ -453,7 +453,7 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[OvisImagePatchInputs]:
+    ) -> OvisImagePatchInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         indicator_tokens = kwargs.pop("indicator_tokens", None)
 
@@ -527,10 +527,10 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
@@ -547,7 +547,7 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.llm.compute_logits(hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py
index bb4fb1d17c151..b4e2f42be5979 100644
--- a/vllm/model_executor/models/ovis2_5.py
+++ b/vllm/model_executor/models/ovis2_5.py
@@ -4,7 +4,7 @@
 
 from collections.abc import Iterable, Mapping
 from functools import partial
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal
 
 import torch
 import torch.nn as nn
@@ -102,7 +102,7 @@ class VisualTokenizer(torch.nn.Module):
         self,
         config: PretrainedConfig,
         visual_vocab_size: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -129,7 +129,7 @@ class VisualTokenizer(torch.nn.Module):
     def _init_backbone(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -205,7 +205,7 @@ class Ovis2_5ProcessingInfo(BaseProcessingInfo):
     def get_image_processor(self) -> BaseImageProcessor:
         return self.get_hf_processor().image_processor  # type: ignore
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None, "video": 1}
 
     def get_image_size_with_most_features(self) -> ImageSize:
@@ -274,7 +274,7 @@ class Ovis2_5ProcessingInfo(BaseProcessingInfo):
         image_width: int,
         image_height: int,
         num_frames: int,
-        image_processor: Optional[BaseImageProcessor],
+        image_processor: BaseImageProcessor | None,
     ) -> int:
         num_video_tokens = self.get_num_image_tokens(
             image_width=image_width, image_height=image_height, num_frames=num_frames
@@ -305,7 +305,7 @@ class Ovis2_5DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2_5ProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
@@ -482,7 +482,7 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[Ovis2_5ImagePatchInputs]:
+    ) -> Ovis2_5ImagePatchInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         indicator_tokens = kwargs.pop("indicator_tokens", None)
         grids = kwargs.pop("grids", None)
@@ -516,7 +516,7 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
 
     def _parse_and_validate_video_input(
         self, **kwargs: object
-    ) -> Optional[Ovis2_5VideoPatchInputs]:
+    ) -> Ovis2_5VideoPatchInputs | None:
         pixel_values = kwargs.pop("video_pixel_values", None)
         indicator_tokens = kwargs.pop("video_indicator_tokens", None)
         grids = kwargs.pop("video_grids", None)
@@ -549,7 +549,7 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
         raise AssertionError("This line should be unreachable.")
 
     def _process_visual_input(
-        self, visual_input: Union[Ovis2_5ImagePatchInputs, Ovis2_5VideoPatchInputs]
+        self, visual_input: Ovis2_5ImagePatchInputs | Ovis2_5VideoPatchInputs
     ) -> MultiModalEmbeddings:
         image_patches_flat = visual_input["flat_data"]
         patches_per_image = visual_input["patches_per_item"]
@@ -629,10 +629,10 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
@@ -649,7 +649,7 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.llm.compute_logits(hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index 7bddfc5ee855b..fb0b4b2904675 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal, TypeAlias
 
 import torch
 from torch import nn
@@ -74,7 +74,9 @@ class PaliGemmaImageEmbeddingInputs(TensorSchema):
     data: Annotated[torch.Tensor, TensorShape("bn", "ifs", "hs")]
 
 
-PaliGemmaImageInputs = Union[PaliGemmaImagePixelInputs, PaliGemmaImageEmbeddingInputs]
+PaliGemmaImageInputs: TypeAlias = (
+    PaliGemmaImagePixelInputs | PaliGemmaImageEmbeddingInputs
+)
 
 
 class PaliGemmaMultiModalProjector(nn.Module):
@@ -95,7 +97,7 @@ class PaliGemmaProcessingInfo(BaseProcessingInfo):
     def get_vision_encoder_info(self):
         return get_vision_encoder_info(self.get_hf_config())
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": 1}
 
     def get_num_image_tokens(
@@ -120,7 +122,7 @@ class PaliGemmaDummyInputsBuilder(BaseDummyInputsBuilder[PaliGemmaProcessingInfo
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         hf_config = self.info.get_hf_config()
         vision_config = hf_config.vision_config
@@ -217,11 +219,11 @@ class PaliGemmaMultiModalProcessor(BaseMultiModalProcessor[PaliGemmaProcessingIn
 
     def apply(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-        tokenization_kwargs: Optional[Mapping[str, object]] = None,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        tokenization_kwargs: Mapping[str, object] | None = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalInputs:
         mm_inputs = super().apply(
             prompt,
@@ -273,7 +275,7 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
     )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return None
 
@@ -317,7 +319,7 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[PaliGemmaImageInputs]:
+    ) -> PaliGemmaImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
 
@@ -386,8 +388,8 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> IntermediateTensors:
         if intermediate_tensors is not None:
@@ -402,7 +404,7 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py
index d3df5f9a59b58..2c62f6862cf25 100644
--- a/vllm/model_executor/models/persimmon.py
+++ b/vllm/model_executor/models/persimmon.py
@@ -25,7 +25,6 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -63,7 +62,7 @@ from .utils import (
 
 class PersimmonMLP(nn.Module):
     def __init__(
-        self, config: PersimmonConfig, quant_config: Optional[QuantizationConfig] = None
+        self, config: PersimmonConfig, quant_config: QuantizationConfig | None = None
     ):
         super().__init__()
         self.dense_h_to_4h = ColumnParallelLinear(
@@ -85,8 +84,8 @@ class PersimmonAttention(nn.Module):
     def __init__(
         self,
         config: PersimmonConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -181,8 +180,8 @@ class PersimmonDecoderLayer(nn.Module):
     def __init__(
         self,
         config: PersimmonConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -263,9 +262,9 @@ class PersimmonModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -340,8 +339,8 @@ class PersimmonForCausalLM(nn.Module, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ):
         hidden_states = self.model(
             input_ids=input_ids,
@@ -354,7 +353,7 @@ class PersimmonForCausalLM(nn.Module, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index 779b391008bb5..6adcaf5084cbe 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -40,7 +40,6 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -80,8 +79,8 @@ class PhiAttention(nn.Module):
     def __init__(
         self,
         config: PhiConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -149,7 +148,7 @@ class PhiAttention(nn.Module):
 
 class PhiMLP(nn.Module):
     def __init__(
-        self, config: PhiConfig, quant_config: Optional[QuantizationConfig] = None
+        self, config: PhiConfig, quant_config: QuantizationConfig | None = None
     ):
         super().__init__()
 
@@ -179,8 +178,8 @@ class PhiLayer(nn.Module):
     def __init__(
         self,
         config: PhiConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -241,9 +240,9 @@ class PhiModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -348,9 +347,9 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -360,7 +359,7 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states, self.lm_head.bias)
         return logits
 
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index d972604db9cd2..93cc7af176d21 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -16,7 +16,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Any, Literal, Optional, Union
+from typing import Annotated, Any, Literal, TypeAlias
 
 import regex as re
 import torch
@@ -96,7 +96,7 @@ CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(
 
 def _init_img_processor(
     hf_config: PretrainedConfig,
-    quant_config: Optional[QuantizationConfig],
+    quant_config: QuantizationConfig | None,
     prefix: str = "",
 ) -> CLIPVisionModel:
     clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG
@@ -132,14 +132,14 @@ class Phi3VImagePixelInputs(TensorSchema):
 
     # Supports either a stacked tensor or a list of (p, 3, h, w) tensors
     pixel_values: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape(
             "bn", "p", 3, "h", "w", dynamic_dims={"p"}
         ),  # 'p' may vary across items
     ]
 
     # Stacked tensor with height and width for each image
-    image_sizes: Annotated[Optional[torch.Tensor], TensorShape("bn", 2)]
+    image_sizes: Annotated[torch.Tensor | None, TensorShape("bn", 2)]
 
 
 class Phi3VImageEmbeddingInputs(TensorSchema):
@@ -153,12 +153,12 @@ class Phi3VImageEmbeddingInputs(TensorSchema):
 
     type: Literal["image_embeds"] = "image_embeds"
     data: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "f", "h"),
     ]
 
 
-Phi3VImageInputs = Union[Phi3VImagePixelInputs, Phi3VImageEmbeddingInputs]
+Phi3VImageInputs: TypeAlias = Phi3VImagePixelInputs | Phi3VImageEmbeddingInputs
 
 
 class Phi3ImageEmbeddingBase(nn.Module):
@@ -192,7 +192,7 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -350,7 +350,7 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
 
 
 class Phi3VProcessingInfo(BaseProcessingInfo):
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_num_image_tokens(
@@ -358,7 +358,7 @@ class Phi3VProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[ProcessorMixin] = None,
+        processor: ProcessorMixin | None = None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor()
@@ -386,7 +386,7 @@ class Phi3VDummyInputsBuilder(BaseDummyInputsBuilder[Phi3VProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
@@ -574,7 +574,7 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant)
     )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return f"<|image_{i}|>"
 
@@ -620,7 +620,7 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant)
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[Phi3VImageInputs]:
+    ) -> Phi3VImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_sizes = kwargs.pop("image_sizes", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -684,9 +684,9 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant)
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
         *,
-        is_multimodal: Optional[torch.Tensor] = None,
+        is_multimodal: torch.Tensor | None = None,
         handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
         inputs_embeds = self._get_text_embeddings(
@@ -716,8 +716,8 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant)
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ):
         if intermediate_tensors is not None:
@@ -732,7 +732,7 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant)
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py
index 002233d0677b0..b99e3a5a1fd84 100644
--- a/vllm/model_executor/models/phi4_multimodal.py
+++ b/vllm/model_executor/models/phi4_multimodal.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Any, Literal, Optional, Union
+from typing import Annotated, Any, Literal, TypeAlias
 
 import numpy as np
 import torch
@@ -147,7 +147,7 @@ class Phi4MMImageEmbedding(nn.Module):
     def get_img_features(
         self,
         img_embeds: torch.FloatTensor,
-        attention_mask: Optional[torch.Tensor] = None,
+        attention_mask: torch.Tensor | None = None,
     ) -> torch.FloatTensor:
         img_feature = self.img_processor(
             img_embeds, patch_attention_mask=attention_mask
@@ -172,8 +172,8 @@ class Phi4MMImageEmbedding(nn.Module):
     def forward(
         self,
         image_pixel_values: torch.FloatTensor,
-        image_sizes: Optional[torch.Tensor] = None,
-        image_attention_mask: Optional[torch.Tensor] = None,
+        image_sizes: torch.Tensor | None = None,
+        image_attention_mask: torch.Tensor | None = None,
     ) -> torch.FloatTensor:
         image_pixel_values = image_pixel_values.to(
             self.img_processor.embeddings.patch_embedding.weight.dtype
@@ -278,7 +278,7 @@ class Phi4MultimodalAudioMLP(nn.Module):
     def __init__(
         self,
         config: Phi4MultimodalAudioConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -311,7 +311,7 @@ class Phi4MultimodalAudioAttention(nn.Module):
     def __init__(
         self,
         config: Phi4MultimodalAudioConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -522,7 +522,7 @@ class Phi4MultimodalAudioModel(nn.Module):
         pad_mask = pad_mask & enc_streaming_mask
         return pad_mask
 
-    def forward(self, hidden_states: torch.Tensor, mask: Optional[torch.Tensor] = None):
+    def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor | None = None):
         hidden_states = self.encoder_embedding(hidden_states)
         hidden_states, hs_mask, mask = self.forward_embeddings(hidden_states, mask)
 
@@ -673,7 +673,7 @@ class Phi4MMImagePixelInputs(TensorSchema):
     type: Literal["pixel_values"]
 
     data: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape(
             "bn", "p", 3, "h", "w", dynamic_dims={"p"}
         ),  # may be different per batch and image
@@ -706,7 +706,7 @@ class Phi4MMImageEmbeddingInputs(TensorSchema):
     type: Literal["image_embeds"]
 
     data: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "f", "h"),
     ]
 
@@ -722,7 +722,7 @@ class Phi4MMAudioFeatureInputs(TensorSchema):
     type: Literal["audio_features"]
 
     data: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "t", 80, dynamic_dims={"t"}),
     ]
 
@@ -744,8 +744,8 @@ class Phi4MMAudioEmbeddingInputs(TensorSchema):
     ]
 
 
-Phi4MMImageInput = Union[Phi4MMImagePixelInputs, Phi4MMImageEmbeddingInputs]
-Phi4MMAudioInputs = Union[Phi4MMAudioFeatureInputs, Phi4MMAudioEmbeddingInputs]
+Phi4MMImageInput: TypeAlias = Phi4MMImagePixelInputs | Phi4MMImageEmbeddingInputs
+Phi4MMAudioInputs: TypeAlias = Phi4MMAudioFeatureInputs | Phi4MMAudioEmbeddingInputs
 
 
 def cat_with_pad(tensors, dim, padding_value=0):
@@ -786,7 +786,7 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
 
     def get_image_processor(
         self,
-        processor: Optional[Phi4MMProcessor] = None,
+        processor: Phi4MMProcessor | None = None,
     ) -> Phi4MultimodalImageProcessorFast:
         if processor is None:
             processor = self.get_hf_processor()
@@ -794,11 +794,11 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
 
     def get_dynamic_hd(
         self,
-        processor: Optional[Phi4MMProcessor] = None,
+        processor: Phi4MMProcessor | None = None,
     ) -> int:
         return self.get_image_processor(processor).dynamic_hd
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None, "image": None}
 
     def _find_target_aspect_ratio(
@@ -936,7 +936,7 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[Phi4MMProcessor] = None,
+        processor: Phi4MMProcessor | None = None,
     ) -> int:
         hf_config = self.get_hf_config()
         vision_config = hf_config.vision_config
@@ -959,7 +959,7 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
 
     def get_image_size_with_most_features(
         self,
-        processor: Optional[Phi4MMProcessor] = None,
+        processor: Phi4MMProcessor | None = None,
     ) -> ImageSize:
         vit_image_size = self.get_hf_config().vision_config.image_size
 
@@ -1038,7 +1038,7 @@ class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_audios = mm_counts.get("audio", 0)
         num_images = mm_counts.get("image", 0)
@@ -1216,7 +1216,7 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
     )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|image|>"
         if modality.startswith("audio"):
@@ -1253,7 +1253,7 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
 
     def _parse_and_validate_audio_input(
         self, **kwargs: object
-    ) -> Optional[Phi4MMAudioInputs]:
+    ) -> Phi4MMAudioInputs | None:
         """
         Parse and validate the audio input to the model.  This handles both
         audio features and audio embeddings, but only the former is used for
@@ -1314,7 +1314,7 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[Phi4MMImagePixelInputs]:
+    ) -> Phi4MMImagePixelInputs | None:
         image_pixel_values: NestedTensors = kwargs.get("image_pixel_values")
         if image_pixel_values is None:
             return None
@@ -1445,8 +1445,8 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> torch.Tensor:
         if intermediate_tensors is not None:
@@ -1464,7 +1464,7 @@ class Phi4MultimodalForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 981f9b37846fe..dce31f9d0aac6 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Any, Literal, Optional, Union
+from typing import Annotated, Any, Literal, TypeAlias
 
 import numpy as np
 import torch
@@ -122,7 +122,7 @@ class Phi4MMImageEncoder(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         prefix: str = "",
         model_dir: str = "",
     ) -> None:
@@ -468,7 +468,7 @@ class Phi4MMImagePixelInputs(TensorSchema):
     type: Literal["pixel_values"]
 
     data: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape(
             "bn", "p", 3, "h", "w", dynamic_dims={"p"}
         ),  # may be different per batch and image
@@ -500,7 +500,7 @@ class Phi4MMAudioFeatureInputs(TensorSchema):
     type: Literal["audio_features"]
 
     data: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", "t", 80, dynamic_dims={"t"}),
     ]
 
@@ -521,7 +521,7 @@ class Phi4MMAudioEmbeddingInputs(TensorSchema):
     ]
 
 
-Phi4MMAudioInputs = Union[Phi4MMAudioFeatureInputs, Phi4MMAudioEmbeddingInputs]
+Phi4MMAudioInputs: TypeAlias = Phi4MMAudioFeatureInputs | Phi4MMAudioEmbeddingInputs
 
 
 def cat_with_pad(tensors, dim, padding_value=0):
@@ -561,7 +561,7 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
 
     def get_dynamic_hd(
         self,
-        processor: Optional[ProcessorMixin] = None,
+        processor: ProcessorMixin | None = None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor()
@@ -571,7 +571,7 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
     def get_feature_extractor(self, **kwargs: object) -> SequenceFeatureExtractor:
         return self.get_hf_processor(**kwargs).audio_processor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None, "image": None}
 
     def _find_target_aspect_ratio(
@@ -709,7 +709,7 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[ProcessorMixin] = None,
+        processor: ProcessorMixin | None = None,
     ) -> int:
         hf_config = self.get_hf_config()
         vision_encoder_name = hf_config.img_processor
@@ -735,7 +735,7 @@ class Phi4MMProcessingInfo(BaseProcessingInfo):
 
     def get_image_size_with_most_features(
         self,
-        processor: Optional[ProcessorMixin] = None,
+        processor: ProcessorMixin | None = None,
     ) -> ImageSize:
         hf_config = self.get_hf_config()
         vision_encoder_name = hf_config.img_processor
@@ -819,7 +819,7 @@ class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_audios = mm_counts.get("audio", 0)
         num_images = mm_counts.get("image", 0)
@@ -1008,7 +1008,7 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
     )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return f"<|image_{i}|>"
         if modality.startswith("audio"):
@@ -1074,7 +1074,7 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
 
     def _parse_and_validate_audio_input(
         self, **kwargs: object
-    ) -> Optional[Phi4MMAudioInputs]:
+    ) -> Phi4MMAudioInputs | None:
         """
         Parse and validate the audio input to the model.  This handles both
         audio features and audio embeddings, but only the former is used for
@@ -1135,7 +1135,7 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[Phi4MMImagePixelInputs]:
+    ) -> Phi4MMImagePixelInputs | None:
         input_image_embeds: NestedTensors = kwargs.get("input_image_embeds")
         if input_image_embeds is None:
             return None
@@ -1263,8 +1263,8 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> torch.Tensor:
         if intermediate_tensors is not None:
@@ -1282,7 +1282,7 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py
index d289e26efa10f..493fdb465fbad 100644
--- a/vllm/model_executor/models/phi4mm_audio.py
+++ b/vllm/model_executor/models/phi4mm_audio.py
@@ -7,7 +7,7 @@
 #!/usr/bin/env python3
 import abc
 import math
-from typing import Any, Literal, Optional, Union
+from typing import Any, Literal
 
 import numpy as np
 import torch
@@ -221,7 +221,7 @@ class ConformerEncoderLayer(nn.Module):
         pos_k: torch.Tensor,
         pos_v: torch.Tensor,
         mask: torch.Tensor,
-        relative_attention_bias: Optional[Tensor] = None,
+        relative_attention_bias: Tensor | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         """ConformerEncoder forward.
 
@@ -329,8 +329,8 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
     def __init__(
         self,
         input_size: int,
-        chunk_size: Union[int, list[int]],
-        left_chunk: Union[int, list[int]],
+        chunk_size: int | list[int],
+        left_chunk: int | list[int],
         attention_dim: int = 256,
         attention_heads: int = 4,
         input_layer: str = "nemo_conv",
@@ -339,12 +339,12 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
         time_reduction: int = 4,
         dropout_rate: float = 0.0,
         padding_idx: int = -1,
-        relative_attention_bias_args: Optional[dict[str, Any]] = None,
+        relative_attention_bias_args: dict[str, Any] | None = None,
         positional_dropout_rate: float = 0.0,
-        nemo_conv_settings: Optional[dict[str, Any]] = None,
+        nemo_conv_settings: dict[str, Any] | None = None,
         conv2d_extra_padding: Literal["feat", "feat_time", "none", True] = "none",
         attention_group_size: int = 1,
-        encoder_embedding_config: Optional[dict[str, Any]] = None,
+        encoder_embedding_config: dict[str, Any] | None = None,
     ) -> None:
         super().__init__()
         self.input_size = input_size
@@ -411,8 +411,8 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
         )
 
     def compute_lens_change(
-        self, feature_lens: Union[int, torch.Tensor]
-    ) -> Union[int, torch.Tensor]:
+        self, feature_lens: int | torch.Tensor
+    ) -> int | torch.Tensor:
         """feature_lens: int
         return updated feature lens.
 
@@ -452,8 +452,8 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
 
     def _chunk_size_selection(
         self,
-        chunk_size: Optional[Union[int, list[int]]] = None,
-        left_chunk: Optional[Union[int, list[int]]] = None,
+        chunk_size: int | list[int] | None = None,
+        left_chunk: int | list[int] | None = None,
     ) -> tuple[int, int]:
         """If chunk size is a list, we will randomly select a chunk size."""
 
@@ -503,7 +503,7 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
 
     def _position_embedding(
         self, input_tensor: torch.Tensor
-    ) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor | None, torch.Tensor | None]:
         pos_k = None
         pos_v = None
         if self.relative_attention_bias_layer is None:
@@ -516,8 +516,8 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
         self,
         seq_len: int,
         batch_size: int,
-        chunk_size: Union[int, list[int]],
-        left_chunk: Union[int, list[int]],
+        chunk_size: int | list[int],
+        left_chunk: int | list[int],
     ) -> torch.Tensor:
         chunk_size_train_eff, left_chunk_train_eff = self._chunk_size_selection(
             chunk_size, left_chunk
@@ -540,25 +540,25 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
         self,
         xs_pad: torch.Tensor,
         masks: torch.Tensor,
-        chunk_size_nc: Optional[Union[int, list[int]]] = None,
-        left_chunk_nc: Optional[Union[int, list[int]]] = None,
-    ) -> Union[
+        chunk_size_nc: int | list[int] | None = None,
+        left_chunk_nc: int | list[int] | None = None,
+    ) -> (
         tuple[
             torch.Tensor,
-            Optional[torch.Tensor],
-            Optional[torch.Tensor],
+            torch.Tensor | None,
+            torch.Tensor | None,
             torch.Tensor,
             torch.Tensor,
-        ],
-        tuple[
+        ]
+        | tuple[
             torch.Tensor,
-            Optional[torch.Tensor],
-            Optional[torch.Tensor],
+            torch.Tensor | None,
+            torch.Tensor | None,
             torch.Tensor,
             torch.Tensor,
             torch.Tensor,
-        ],
-    ]:
+        ]
+    ):
         """Forwarding the inputs through the top embedding layers
 
         Args:
@@ -803,9 +803,9 @@ class ConformerEncoder(TransformerEncoderBase):
     def __init__(  # pylint: disable-all
         self,
         input_size: int,
-        chunk_size: Union[int, list[int]],
-        left_chunk: Union[int, list[int]],
-        num_lang: Optional[int] = None,
+        chunk_size: int | list[int],
+        left_chunk: int | list[int],
+        num_lang: int | None = None,
         attention_dim: int = 256,
         attention_heads: int = 4,
         linear_units: int = 2048,
@@ -832,14 +832,14 @@ class ConformerEncoder(TransformerEncoderBase):
         extra_layer_output_idx: int = -1,
         extra_multi_layer_output_idxs: list[int] = [],  # noqa
         activation_checkpointing: str = "",
-        relative_attention_bias_args: Optional[dict[str, Any]] = None,
+        relative_attention_bias_args: dict[str, Any] | None = None,
         time_reduction: int = 4,
         use_pt_scaled_dot_product_attention: bool = False,
-        nemo_conv_settings: Optional[dict[str, Any]] = None,
+        nemo_conv_settings: dict[str, Any] | None = None,
         conv2d_extra_padding: Literal["feat", "feat_time", "none", True] = "none",
         replication_pad_for_subsample_embedding: bool = False,
         attention_group_size: int = 1,
-        encoder_embedding_config: Optional[dict[str, Any]] = None,
+        encoder_embedding_config: dict[str, Any] | None = None,
     ) -> None:
         super().__init__(
             input_size,
@@ -908,12 +908,12 @@ class ConformerEncoder(TransformerEncoderBase):
 
     def init_relative_attention_bias(
         self, input_tensor: torch.Tensor
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         if self.relative_attention_bias_layer:
             return self.relative_attention_bias_layer(input_tensor)
 
     def calculate_hs_mask(
-        self, xs_pad: torch.Tensor, device: torch.device, mask: Optional[torch.Tensor]
+        self, xs_pad: torch.Tensor, device: torch.device, mask: torch.Tensor | None
     ) -> torch.Tensor:
         max_audio_length = xs_pad.shape[1]
         batch_size = xs_pad.shape[0]
@@ -1066,9 +1066,9 @@ class WindowQformer(nn.Module):
     def forward(
         self,
         audio_embed: torch.Tensor,
-        mask: Optional[torch.Tensor],
-        embed_len: Optional[int] = None,
-    ) -> tuple[torch.Tensor, Optional[int]]:
+        mask: torch.Tensor | None,
+        embed_len: int | None = None,
+    ) -> tuple[torch.Tensor, int | None]:
         """forward decoder"""
         # audio_embed: N x T x D => N x D x T
 
@@ -1224,7 +1224,7 @@ class AudioEmbedding(nn.Module):
     def get_audio_features(
         self,
         input_embeds: torch.Tensor,
-        audio_attention_mask: Optional[torch.Tensor] = None,
+        audio_attention_mask: torch.Tensor | None = None,
         audio_projection_mode: str = "speech",
     ) -> torch.Tensor:
         """
@@ -1278,7 +1278,7 @@ class AudioEmbedding(nn.Module):
     def forward(
         self,
         audio_features: torch.Tensor,
-        audio_attention_mask: Optional[torch.Tensor] = None,
+        audio_attention_mask: torch.Tensor | None = None,
         audio_projection_mode: str = "speech",
     ) -> torch.Tensor:
         """
diff --git a/vllm/model_executor/models/phi4mm_utils.py b/vllm/model_executor/models/phi4mm_utils.py
index d50547c199ac5..698435eb76c96 100644
--- a/vllm/model_executor/models/phi4mm_utils.py
+++ b/vllm/model_executor/models/phi4mm_utils.py
@@ -6,7 +6,6 @@
 # but implemented by the Phi-Speech team
 #!/usr/bin/env python3
 import math
-from typing import Optional, Union
 
 import torch
 import torch.nn.functional as F
@@ -917,7 +916,7 @@ class CausalConv1D(nn.Conv1d):
         out_channels: int,
         kernel_size: int,
         stride: int = 1,
-        padding: Union[str, int] = 0,
+        padding: str | int = 0,
         dilation: int = 1,
         groups: int = 1,
         bias: bool = True,
@@ -962,8 +961,8 @@ class CausalConv1D(nn.Conv1d):
         )
 
     def update_cache(
-        self, x: Tensor, cache: Optional[Tensor] = None
-    ) -> tuple[Tensor, Optional[Tensor]]:
+        self, x: Tensor, cache: Tensor | None = None
+    ) -> tuple[Tensor, Tensor | None]:
         if cache is None:
             new_x = F.pad(x, pad=(self._left_padding, self._right_padding))
             next_cache = cache
@@ -978,8 +977,8 @@ class CausalConv1D(nn.Conv1d):
         return new_x, next_cache
 
     def forward(
-        self, x: Tensor, cache: Optional[Tensor] = None
-    ) -> Union[Tensor, tuple[Tensor, Optional[Tensor]]]:
+        self, x: Tensor, cache: Tensor | None = None
+    ) -> Tensor | tuple[Tensor, Tensor | None]:
         x, cache = self.update_cache(x, cache=cache)
         x = super().forward(x)
         if cache is None:
@@ -1002,7 +1001,7 @@ class CausalConv2D(nn.Conv2d):
         out_channels: int,
         kernel_size: int,
         stride: int = 1,
-        padding: Union[str, int] = 0,
+        padding: str | int = 0,
         dilation: int = 1,
         groups: int = 1,
         bias: bool = True,
@@ -1371,9 +1370,7 @@ class NemoConvSubsampling(torch.nn.Module):
     def get_streaming_cache_size(self) -> list[int]:
         return [0, self.subsampling_factor + 1]
 
-    def forward(
-        self, x: Tensor, mask: Optional[Tensor]
-    ) -> tuple[Tensor, Optional[Tensor]]:
+    def forward(self, x: Tensor, mask: Tensor | None) -> tuple[Tensor, Tensor | None]:
         """
         Forward method for NeMo subsampling.
 
@@ -1615,10 +1612,10 @@ class AttModule(nn.Module):
     def forward(
         self,
         x: Tensor,
-        memory: Optional[Tensor] = None,
-        pos_emb: Optional[Tensor] = None,
-        att_mask: Optional[Tensor] = None,
-    ) -> tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]:
+        memory: Tensor | None = None,
+        pos_emb: Tensor | None = None,
+        att_mask: Tensor | None = None,
+    ) -> tuple[Tensor, Tensor, Tensor | None, Tensor | None]:
         """AttModule forward
 
         Args:
@@ -1640,7 +1637,7 @@ class AttBlock(BlockBase, AttModule):
 
 def masked_softmax(
     scores: Tensor,
-    mask: Optional[Tensor],
+    mask: Tensor | None,
 ) -> Tensor:
     if mask is not None:
         mask = mask.unsqueeze(1).eq(0)  # (batch, 1, time1, time2)
@@ -1720,7 +1717,7 @@ class MultiHeadedAttention(nn.Module):
         self.linear_v = nn.Linear(n_value, attention_inner_dim // group_size)
         self.linear_out = nn.Linear(attention_inner_dim // group_size, n_value)
 
-        self.attn = torch.jit.Attribute(None, Optional[Tensor])
+        self.attn = torch.jit.Attribute(None, Tensor | None)
         self.dropout = nn.Dropout(p=dropout_rate)
         self.dropout_rate = dropout_rate
         self.use_pt_scaled_dot_product_attention = use_pt_scaled_dot_product_attention
@@ -1741,10 +1738,10 @@ class MultiHeadedAttention(nn.Module):
         query: Tensor,
         key: Tensor,
         value: Tensor,
-        pos_k: Optional[Tensor],
-        pos_v: Optional[Tensor],
-        mask: Optional[Tensor],
-        relative_attention_bias: Optional[Tensor] = None,
+        pos_k: Tensor | None,
+        pos_v: Tensor | None,
+        mask: Tensor | None,
+        relative_attention_bias: Tensor | None = None,
     ) -> Tensor:
         """Compute 'Scaled Dot Product Attention'.
 
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index fee52edfe26c8..2cd4d8c727216 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -26,7 +26,6 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -257,9 +256,9 @@ class PhiMoE(nn.Module):
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
-        params_dtype: Optional[torch.dtype] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        tp_size: Optional[int] = None,
+        params_dtype: torch.dtype | None = None,
+        quant_config: QuantizationConfig | None = None,
+        tp_size: int | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -304,12 +303,12 @@ class PhiMoEAttention(nn.Module):
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
-        head_dim: Optional[int] = None,
+        head_dim: int | None = None,
         max_position: int = 4096 * 32,
         rope_theta: float = 10000,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        rope_scaling: Optional[dict] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        rope_scaling: dict | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -386,8 +385,8 @@ class PhiMoEDecoderLayer(nn.Module):
     def __init__(
         self,
         config: PhiMoEConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -427,7 +426,7 @@ class PhiMoEDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
         residual = hidden_states
 
@@ -496,9 +495,9 @@ class PhiMoEModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -674,9 +673,9 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 62f642eae4b52..0555717017cdc 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -5,7 +5,7 @@ import math
 from collections.abc import Iterable, Mapping, Sequence
 from dataclasses import dataclass, fields
 from functools import cached_property
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal
 
 import torch
 import torch.nn as nn
@@ -100,7 +100,7 @@ class PixtralImagePixelInputs(TensorSchema):
     type: Literal["pixel_values"] = "pixel_values"
 
     images: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("bn", 3, "h", "w", dynamic_dims={"h", "w"}),
     ]
 
@@ -144,9 +144,9 @@ class PixtralProcessorAdapter:
 
     def __call__(
         self,
-        text: Optional[Union[TextInput, list[TextInput]]] = None,
-        images: Optional[Union[ImageInput, list[ImageInput]]] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        text: TextInput | list[TextInput] | None = None,
+        images: ImageInput | list[ImageInput] | None = None,
+        return_tensors: str | TensorType | None = None,
         **kwargs,
     ) -> Mapping[str, NestedTensors]:
         if text is None:
@@ -203,12 +203,12 @@ class PixtralProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self) -> PixtralProcessorAdapter:
         return PixtralProcessorAdapter(self.get_tokenizer())
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_vision_config(
         self,
-        processor: Optional[PixtralProcessorAdapter] = None,
+        processor: PixtralProcessorAdapter | None = None,
     ):
         if processor is None:
             processor = self.get_hf_processor()
@@ -223,7 +223,7 @@ class PixtralProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[PixtralProcessorAdapter] = None,
+        processor: PixtralProcessorAdapter | None = None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor()
@@ -249,7 +249,7 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
@@ -270,7 +270,7 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> ProcessorInputs:
         tokenizer = self.info.get_tokenizer()
 
@@ -342,11 +342,11 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo])
 
     def _cached_apply_hf_processor(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(
             prompt=prompt,
@@ -369,7 +369,7 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
     merge_by_field_config = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return None
 
@@ -420,7 +420,7 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[PixtralImagePixelInputs]:
+    ) -> PixtralImagePixelInputs | None:
         images = kwargs.pop("images", None)
         if images is None:
             return None
@@ -472,10 +472,10 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for pixtral."""
         if intermediate_tensors is not None:
             inputs_embeds = None
@@ -489,7 +489,7 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
@@ -717,7 +717,7 @@ class Transformer(nn.Module):
         self,
         x: torch.Tensor,
         mask: torch.Tensor,
-        freqs_cis: Optional[torch.Tensor],
+        freqs_cis: torch.Tensor | None,
     ) -> torch.Tensor:
         for layer in self.layers:
             x = layer(x, mask=mask, freqs_cis=freqs_cis)
@@ -759,7 +759,7 @@ class VisionTransformer(nn.Module):
 
         head_dim = self.args.hidden_size // self.args.num_attention_heads
         assert head_dim % 2 == 0, "ROPE requires even head_dim"
-        self._freqs_cis: Optional[torch.Tensor] = None
+        self._freqs_cis: torch.Tensor | None = None
 
     @property
     def max_patches_per_side(self) -> int:
@@ -1015,7 +1015,7 @@ class PixtralHFMLP(nn.Module):
     def __init__(
         self,
         config: PixtralVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
         prefix: str = "",
     ) -> None:
@@ -1049,7 +1049,7 @@ class PixtralHFAttention(nn.Module):
     def __init__(
         self,
         config: PixtralVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
         prefix: str = "",
     ) -> None:
@@ -1084,7 +1084,7 @@ class PixtralHFAttention(nn.Module):
         hidden_states: torch.Tensor,
         attention_mask: torch.Tensor,
         position_embeddings: torch.Tensor,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         batch, patches, _ = hidden_states.size()
 
         qkv_states, _ = self.qkv_proj(hidden_states)
@@ -1119,7 +1119,7 @@ class PixtralHFTransformerBlock(nn.Module):
     def __init__(
         self,
         config: PixtralVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
         prefix: str = "",
     ) -> None:
@@ -1155,9 +1155,9 @@ class PixtralHFTransformer(nn.Module):
     def __init__(
         self,
         config: PixtralVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
+        num_hidden_layers_override: int | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -1202,10 +1202,10 @@ class PixtralHFVisionModel(nn.Module):
     def __init__(
         self,
         config: PixtralVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
-        require_post_norm: Optional[bool] = None,
+        num_hidden_layers_override: int | None = None,
+        require_post_norm: bool | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -1247,8 +1247,8 @@ class PixtralHFVisionModel(nn.Module):
         self,
         pixel_values: list[torch.Tensor],
         *,
-        select_layers: Optional[list[int]] = None,
-        feature_select_strategy: Optional[VisionFeatureSelectStrategy] = None,
+        select_layers: list[int] | None = None,
+        feature_select_strategy: VisionFeatureSelectStrategy | None = None,
     ) -> tuple[torch.Tensor, ...]:
         """
         Args:
diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py
index 278957e7cf6ce..b35a8c6b66f26 100644
--- a/vllm/model_executor/models/plamo2.py
+++ b/vllm/model_executor/models/plamo2.py
@@ -4,7 +4,7 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend
@@ -504,7 +504,7 @@ class DenseMLP(nn.Module):
     def __init__(
         self,
         config: Plamo2Config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -672,7 +672,7 @@ class Plamo2DecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         **kwargs,
     ):
         if residual is None:
@@ -728,7 +728,7 @@ class Plamo2Decoder(torch.nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
         for layer in islice(self.layers, self.start_layer, self.end_layer):
             hidden_states, residual = layer(
@@ -770,8 +770,8 @@ class Plamo2Model(torch.nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
@@ -851,8 +851,8 @@ class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
     ):
         hidden_states = self.model(
@@ -901,7 +901,7 @@ class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 6a12776b7f94b..72e66d8f30384 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -10,7 +10,7 @@
 import json
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -55,7 +55,7 @@ class QWenMLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str = "silu",
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
@@ -84,9 +84,9 @@ class QWenAttention(nn.Module):
         num_heads: int,
         max_position_embeddings: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        rope_scaling: dict[str, Any] | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -144,8 +144,8 @@ class QWenBlock(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -174,7 +174,7 @@ class QWenBlock(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
@@ -226,9 +226,9 @@ class QWenModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -288,7 +288,7 @@ class QWenBaseModel(nn.Module):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
@@ -357,9 +357,9 @@ class QWenLMHeadModel(QWenBaseModel, SupportsPP, SupportsLoRA):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.transformer(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index c8bc17dbfa0a1..b26546647ce76 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -27,7 +27,7 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -77,7 +77,7 @@ class Qwen2MLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -116,12 +116,12 @@ class Qwen2Attention(nn.Module):
         num_kv_heads: int,
         max_position: int = 4096 * 32,
         rope_theta: float = 10000,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        rope_scaling: Optional[tuple] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        rope_scaling: tuple | None = None,
         prefix: str = "",
         attn_type: str = AttentionType.DECODER,
-        dual_chunk_attention_config: Optional[dict[str, Any]] = None,
+        dual_chunk_attention_config: dict[str, Any] | None = None,
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -210,8 +210,8 @@ class Qwen2DecoderLayer(nn.Module):
     def __init__(
         self,
         config: Qwen2Config,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -261,7 +261,7 @@ class Qwen2DecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
@@ -362,9 +362,9 @@ class Qwen2Model(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -520,9 +520,9 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -531,7 +531,7 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index 0df79fc733f3f..07f814ef64187 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -22,10 +22,10 @@
 # limitations under the License.
 """Inference-only Qwen2.5-Omni model (thinker part)."""
 
-from collections.abc import Iterable, Mapping, Sequence
+from collections.abc import Callable, Iterable, Mapping, Sequence
 from copy import copy
 from functools import partial
-from typing import Annotated, Any, Callable, Literal, Optional, Union
+from typing import Annotated, Any, Literal
 
 import torch
 import torch.nn as nn
@@ -125,7 +125,7 @@ class Qwen2_5OmniAudioFeatureInputs(TensorSchema):
 
     type: Literal["audio_features"]
     input_features: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("nmb", "tsl"),
     ]
 
@@ -191,7 +191,7 @@ class Qwen2_5OmniThinkerMultiModalDataParser(Qwen2VLMultiModalDataParser):
 
     def _parse_audio_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
+        data: dict[str, torch.Tensor] | ModalityData[ImageItem],
     ) -> ModalityDataItems[Any, Any]:
         if isinstance(data, dict):
             return DictEmbeddingItems(
@@ -225,7 +225,7 @@ class Qwen2_5OmniThinkerProcessingInfo(
         assert isinstance(feature_extractor, WhisperFeatureExtractor)
         return feature_extractor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None, "image": None, "video": None}
 
 
@@ -253,7 +253,7 @@ class Qwen2_5OmniThinkerDummyInputsBuilder(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_audios = mm_counts.get("audio", 0)
         num_images = mm_counts.get("image", 0)
@@ -420,7 +420,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
         cls,
         thinker_config: PretrainedConfig,
         audio_len: int,
-        video_grid_thw: Union[list[int], torch.Tensor],
+        video_grid_thw: list[int] | torch.Tensor,
         video_second_per_grid_t: float,
     ) -> list[int]:
         """Get video prompt updates when `use_audio_in_video` is True.
@@ -580,7 +580,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
 
     def _apply_hf_processor_main(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
@@ -665,7 +665,7 @@ class Qwen2_5OmniConditionalGenerationMixin:
 
     def _parse_and_validate_audio_input(
         self, **kwargs: object
-    ) -> Optional[Qwen2_5OmniAudioFeatureInputs]:
+    ) -> Qwen2_5OmniAudioFeatureInputs | None:
         input_audio_features = kwargs.pop("input_audio_features", None)
         audio_feature_lengths = kwargs.pop("audio_feature_lengths", None)
         feature_attention_mask = kwargs.pop("feature_attention_mask", None)
@@ -693,7 +693,7 @@ class Qwen2_5OmniConditionalGenerationMixin:
     def _parse_and_validate_image_input(
         self,
         **kwargs: dict[str, Any],
-    ) -> Optional[Qwen2_5_VLImageInputs]:
+    ) -> Qwen2_5_VLImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
         image_grid_thw = kwargs.pop("image_grid_thw", None)
@@ -743,7 +743,7 @@ class Qwen2_5OmniConditionalGenerationMixin:
     def _parse_and_validate_video_input(
         self,
         **kwargs: dict[str, Any],
-    ) -> Optional[Qwen2_5_VLVideoInputs]:
+    ) -> Qwen2_5_VLVideoInputs | None:
         pixel_values_videos = kwargs.pop("pixel_values_videos", None)
         video_embeds = kwargs.pop("video_embeds", None)
         video_grid_thw = kwargs.pop("video_grid_thw", None)
@@ -892,7 +892,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
     }
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|vision_start|><|IMAGE|><|vision_end|>"
         if modality.startswith("video"):
@@ -991,12 +991,12 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
         cls,
         input_tokens: list[int],
         hf_config: PretrainedConfig,
-        image_grid_thw: Union[list[list[int]], torch.Tensor],
-        video_grid_thw: Union[list[list[int]], torch.Tensor],
-        second_per_grid_ts: Optional[list[float]] = None,
+        image_grid_thw: list[list[int]] | torch.Tensor,
+        video_grid_thw: list[list[int]] | torch.Tensor,
+        second_per_grid_ts: list[float] | None = None,
         context_len: int = 0,
-        seq_len: Optional[int] = None,
-        audio_feature_lengths: Optional[torch.Tensor] = None,
+        seq_len: int | None = None,
+        audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
         """Get mrope input positions and delta value (Qwen2.5-Omni version).
@@ -1225,9 +1225,9 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
         *,
-        is_multimodal: Optional[torch.Tensor] = None,
+        is_multimodal: torch.Tensor | None = None,
         handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
         # This is to satisfy the type checker for each overload
@@ -1241,7 +1241,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
             handle_oov_mm_token=handle_oov_mm_token,
         )
 
-    def get_multimodal_embeddings_v0(self, **kwargs: object) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings_v0(self, **kwargs: object) -> NestedTensors | None:
         audio_input = self._parse_and_validate_audio_input(**kwargs)
         image_input = self._parse_and_validate_image_input(**kwargs)
         video_input = self._parse_and_validate_video_input(**kwargs)
@@ -1266,10 +1266,10 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
@@ -1281,7 +1281,7 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 094fd90aac4e5..3f205307cb225 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -26,9 +26,9 @@
 # limitations under the License.
 """Inference-only Qwen2.5-VL model compatible with HuggingFace weights."""
 
-from collections.abc import Iterable, Mapping, Sequence
+from collections.abc import Callable, Iterable, Mapping, Sequence
 from functools import lru_cache, partial
-from typing import Annotated, Any, Callable, Literal, Optional, Union
+from typing import Annotated, Any, Literal, TypeAlias
 
 import torch
 import torch.nn as nn
@@ -161,9 +161,9 @@ class Qwen2_5_VLImageEmbeddingInputs(TensorSchema):
     ]
 
 
-Qwen2_5_VLImageInputs = Union[
-    Qwen2_5_VLImagePixelInputs, Qwen2_5_VLImageEmbeddingInputs
-]
+Qwen2_5_VLImageInputs: TypeAlias = (
+    Qwen2_5_VLImagePixelInputs | Qwen2_5_VLImageEmbeddingInputs
+)
 
 
 class Qwen2_5_VLVideoPixelInputs(TensorSchema):
@@ -197,7 +197,7 @@ class Qwen2_5_VLVideoPixelInputs(TensorSchema):
     ]
 
     second_per_grid_ts: Annotated[
-        Optional[torch.Tensor],
+        torch.Tensor | None,
         TensorShape("nv"),
     ]
 
@@ -231,9 +231,9 @@ class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema):
     ]
 
 
-Qwen2_5_VLVideoInputs = Union[
-    Qwen2_5_VLVideoPixelInputs, Qwen2_5_VLVideoEmbeddingInputs
-]
+Qwen2_5_VLVideoInputs: TypeAlias = (
+    Qwen2_5_VLVideoPixelInputs | Qwen2_5_VLVideoEmbeddingInputs
+)
 
 # === Vision Encoder === #
 
@@ -245,7 +245,7 @@ class Qwen2_5_VisionMLP(nn.Module):
         hidden_features: int,
         bias: bool = False,
         act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -301,7 +301,7 @@ class Qwen2_5_VisionAttention(nn.Module):
         embed_dim: int,
         num_heads: int,
         projection_size: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
         attn_backend: _Backend = _Backend.TORCH_SDPA,
@@ -386,8 +386,8 @@ class Qwen2_5_VisionAttention(nn.Module):
         x: torch.Tensor,
         cu_seqlens: torch.Tensor,
         rotary_pos_emb: torch.Tensor,
-        max_seqlen: Optional[int] = None,  # Only used for Flash Attention
-        seqlens: Optional[list[int]] = None,  # Only used for xFormers
+        max_seqlen: int | None = None,  # Only used for Flash Attention
+        seqlens: list[int] | None = None,  # Only used for xFormers
     ) -> torch.Tensor:
         # [s, b, c] --> [s, b, head * 3 * head_dim]
         x, _ = self.qkv(x)
@@ -466,8 +466,8 @@ class Qwen2_5_VisionBlock(nn.Module):
         num_heads: int,
         mlp_hidden_dim: int,
         act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
-        norm_layer: Optional[Callable[[int], nn.Module]] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        norm_layer: Callable[[int], nn.Module] | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
         attn_backend: _Backend = _Backend.TORCH_SDPA,
@@ -503,8 +503,8 @@ class Qwen2_5_VisionBlock(nn.Module):
         x: torch.Tensor,
         cu_seqlens: torch.Tensor,
         rotary_pos_emb: torch.Tensor,
-        max_seqlen: Optional[int] = None,  # Only used for Flash Attention
-        seqlens: Optional[list[int]] = None,  # Only used for xFormers
+        max_seqlen: int | None = None,  # Only used for Flash Attention
+        seqlens: list[int] | None = None,  # Only used for xFormers
     ) -> torch.Tensor:
         x_attn = self.attn(
             self.norm1(x),
@@ -552,9 +552,9 @@ class Qwen2_5_VisionPatchMerger(nn.Module):
         self,
         d_model: int,
         context_dim: int,
-        norm_layer: Optional[Callable[[int], nn.Module]] = None,
+        norm_layer: Callable[[int], nn.Module] | None = None,
         spatial_merge_size: int = 2,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -634,7 +634,7 @@ class Qwen2_5_VisionTransformer(nn.Module):
         self,
         vision_config: Qwen2_5_VLVisionConfig,
         norm_eps: float = 1e-6,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -815,7 +815,7 @@ class Qwen2_5_VisionTransformer(nn.Module):
     def compute_attn_mask_seqlen(
         self,
         cu_seqlens: torch.Tensor,
-    ) -> tuple[Optional[int], Optional[list[int]]]:
+    ) -> tuple[int | None, list[int] | None]:
         max_seqlen, seqlens = None, None
         if (
             self.attn_backend == _Backend.FLASH_ATTN
@@ -1080,12 +1080,12 @@ class Qwen2_5_VLForConditionalGeneration(
         cls,
         input_tokens: list[int],
         hf_config: PretrainedConfig,
-        image_grid_thw: Union[list[list[int]], torch.Tensor],
-        video_grid_thw: Union[list[list[int]], torch.Tensor],
+        image_grid_thw: list[list[int]] | torch.Tensor,
+        video_grid_thw: list[list[int]] | torch.Tensor,
         second_per_grid_ts: list[float],
         context_len: int = 0,
-        seq_len: Optional[int] = None,
-        audio_feature_lengths: Optional[torch.Tensor] = None,
+        seq_len: int | None = None,
+        audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
         """Get mrope input positions and delta value."""
@@ -1202,7 +1202,7 @@ class Qwen2_5_VLForConditionalGeneration(
         return llm_positions, mrope_position_delta
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|vision_start|><|image_pad|><|vision_end|>"
         if modality.startswith("video"):
@@ -1273,7 +1273,7 @@ class Qwen2_5_VLForConditionalGeneration(
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[Qwen2_5_VLImageInputs]:
+    ) -> Qwen2_5_VLImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
         image_grid_thw = kwargs.pop("image_grid_thw", None)
@@ -1311,7 +1311,7 @@ class Qwen2_5_VLForConditionalGeneration(
 
     def _parse_and_validate_video_input(
         self, **kwargs: object
-    ) -> Optional[Qwen2_5_VLVideoInputs]:
+    ) -> Qwen2_5_VLVideoInputs | None:
         pixel_values_videos = kwargs.pop("pixel_values_videos", None)
         video_embeds = kwargs.pop("video_embeds", None)
         video_grid_thw = kwargs.pop("video_grid_thw", None)
@@ -1605,10 +1605,10 @@ class Qwen2_5_VLForConditionalGeneration(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for Qwen2.5-VL.
 
         Args:
@@ -1634,7 +1634,7 @@ class Qwen2_5_VLForConditionalGeneration(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index e61a730f97bb6..553fdc4a9e179 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -24,7 +24,7 @@
 """Inference-only Qwen2-Audio model compatible with HuggingFace weights."""
 
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Any, Literal, Optional, Union
+from typing import Annotated, Any, Literal, TypeAlias
 
 import torch
 import torch.nn as nn
@@ -78,7 +78,7 @@ class Qwen2AudioFeatureInputs(TensorSchema):
 
     type: Literal["audio_features"]
     input_features: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("na", "nmb", 3000),
     ]
 
@@ -105,7 +105,7 @@ class Qwen2AudioEmbeddingInputs(TensorSchema):
     ]
 
 
-Qwen2AudioInputs = Union[Qwen2AudioFeatureInputs, Qwen2AudioEmbeddingInputs]
+Qwen2AudioInputs: TypeAlias = Qwen2AudioFeatureInputs | Qwen2AudioEmbeddingInputs
 
 # === Audio Encoder === #
 
@@ -140,7 +140,7 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo):
         assert isinstance(feature_extractor, WhisperFeatureExtractor)
         return feature_extractor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None}
 
 
@@ -157,7 +157,7 @@ class Qwen2AudioDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2AudioProcessingIn
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         feature_extractor = self.info.get_feature_extractor()
 
@@ -185,8 +185,8 @@ def _qwen2audio_field_config(hf_inputs: Mapping[str, torch.Tensor]):
 class Qwen2AudioMultiModalDataParser(MultiModalDataParser):
     def _parse_audio_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[AudioItem]],
-    ) -> Optional[ModalityDataItems[Any, Any]]:
+        data: dict[str, torch.Tensor] | ModalityData[AudioItem],
+    ) -> ModalityDataItems[Any, Any] | None:
         if isinstance(data, dict):
             return DictEmbeddingItems(
                 data,
@@ -314,7 +314,7 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor[Qwen2AudioProcessing
 )
 class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("audio"):
             return f"Audio {i}: <|audio_bos|><|AUDIO|><|audio_eos|>"
 
@@ -358,7 +358,7 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, Supports
 
     def _parse_and_validate_audio_input(
         self, **kwargs: object
-    ) -> Optional[Qwen2AudioInputs]:
+    ) -> Qwen2AudioInputs | None:
         input_features = kwargs.pop("input_features", None)
         audio_embeds = kwargs.pop("audio_embeds", None)
         feature_attention_mask = kwargs.pop("feature_attention_mask", None)
@@ -395,7 +395,7 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, Supports
 
     def _process_audio_input(
         self, audio_input: Qwen2AudioInputs
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         if audio_input["type"] == "audio_embeds":
             audio_embeds = audio_input["audio_embeds"]
             return tuple(audio_embeds)
@@ -471,10 +471,10 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, Supports
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
@@ -486,7 +486,7 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, Supports
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index 7251e7b2eea49..c03bd6a3c6d74 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -27,7 +27,7 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 import torch.nn.functional as F
@@ -77,9 +77,9 @@ class Qwen2MoeMLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         reduce_results: bool = True,
-        expert_gate: Optional[torch.nn.Linear] = None,
+        expert_gate: torch.nn.Linear | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -120,7 +120,7 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
     def __init__(
         self,
         config: Qwen2MoeConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -195,12 +195,12 @@ class Qwen2MoeAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
-        dual_chunk_attention_config: Optional[dict[str, Any]] = None,
+        dual_chunk_attention_config: dict[str, Any] | None = None,
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -285,8 +285,8 @@ class Qwen2MoeDecoderLayer(nn.Module):
     def __init__(
         self,
         config: Qwen2MoeConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -339,7 +339,7 @@ class Qwen2MoeDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> torch.Tensor:
         # Self Attention
         if residual is None:
@@ -396,9 +396,9 @@ class Qwen2MoeModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -548,12 +548,7 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
             getattr(config, "mlp_only_layers", [])
             or config.shared_expert_intermediate_size > 0
         ):
-            self.packed_modules_mapping["gate_up_proj"] = (
-                [
-                    "gate_proj",
-                    "up_proj",
-                ],
-            )
+            self.packed_modules_mapping["gate_up_proj"] = ["gate_proj", "up_proj"]
 
         self.model = Qwen2MoeModel(
             vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
@@ -578,9 +573,9 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -589,7 +584,7 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py
index 75ed95477f78f..c2f2ba637f090 100644
--- a/vllm/model_executor/models/qwen2_rm.py
+++ b/vllm/model_executor/models/qwen2_rm.py
@@ -8,7 +8,6 @@
 """Inference-only Qwen2-RM model compatible with HuggingFace weights."""
 
 from collections.abc import Iterable
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -83,9 +82,9 @@ class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index cb1bf3825c74f..8069039b0c560 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -25,9 +25,9 @@
 # limitations under the License.
 """Inference-only Qwen2-VL model compatible with HuggingFace weights."""
 
-from collections.abc import Iterable, Mapping, Sequence
+from collections.abc import Callable, Iterable, Mapping, Sequence
 from functools import partial
-from typing import Annotated, Any, Callable, Literal, Optional, Union
+from typing import Annotated, Any, Literal, TypeAlias
 
 import torch
 import torch.nn as nn
@@ -167,7 +167,7 @@ class Qwen2VLImageEmbeddingInputs(TensorSchema):
     ]
 
 
-Qwen2VLImageInputs = Union[Qwen2VLImagePixelInputs, Qwen2VLImageEmbeddingInputs]
+Qwen2VLImageInputs: TypeAlias = Qwen2VLImagePixelInputs | Qwen2VLImageEmbeddingInputs
 
 
 class Qwen2VLVideoPixelInputs(TensorSchema):
@@ -228,7 +228,7 @@ class Qwen2VLVideoEmbeddingInputs(TensorSchema):
     ]
 
 
-Qwen2VLVideoInputs = Union[Qwen2VLVideoPixelInputs, Qwen2VLVideoEmbeddingInputs]
+Qwen2VLVideoInputs: TypeAlias = Qwen2VLVideoPixelInputs | Qwen2VLVideoEmbeddingInputs
 
 # === Vision Encoder === #
 
@@ -239,7 +239,7 @@ class Qwen2VisionMLP(nn.Module):
         in_features: int,
         hidden_features: int,
         act_layer: type[nn.Module] = QuickGELU,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -317,7 +317,7 @@ class Qwen2VisionAttention(nn.Module):
         embed_dim: int,
         num_heads: int,
         projection_size: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -413,8 +413,8 @@ class Qwen2VisionAttention(nn.Module):
         x: torch.Tensor,
         cu_seqlens: torch.Tensor,
         rotary_pos_emb: torch.Tensor,
-        max_seqlen: Optional[int] = None,  # Only used for Flash Attention
-        seqlens: Optional[list[int]] = None,  # Only used for xFormers
+        max_seqlen: int | None = None,  # Only used for Flash Attention
+        seqlens: list[int] | None = None,  # Only used for xFormers
     ) -> torch.Tensor:
         # [s, b, c] --> [s, b, 3 * head * head_dim]
         x, _ = self.qkv(x)
@@ -493,8 +493,8 @@ class Qwen2VisionBlock(nn.Module):
         num_heads: int,
         mlp_ratio: float,
         act_layer: type[nn.Module] = QuickGELU,
-        norm_layer: Optional[Callable[[int], nn.Module]] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        norm_layer: Callable[[int], nn.Module] | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -527,8 +527,8 @@ class Qwen2VisionBlock(nn.Module):
         x: torch.Tensor,
         cu_seqlens: torch.Tensor,
         rotary_pos_emb: torch.Tensor,
-        max_seqlen: Optional[int] = None,  # Only used for Flash Attention
-        seqlens: Optional[list[int]] = None,  # Only used for xFormers
+        max_seqlen: int | None = None,  # Only used for Flash Attention
+        seqlens: list[int] | None = None,  # Only used for xFormers
     ) -> torch.Tensor:
         x = x + self.attn(
             self.norm1(x),
@@ -576,9 +576,9 @@ class Qwen2VisionPatchMerger(nn.Module):
         self,
         d_model: int,
         context_dim: int,
-        norm_layer: Optional[Callable[[int], nn.Module]] = None,
+        norm_layer: Callable[[int], nn.Module] | None = None,
         spatial_merge_size: int = 2,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -659,7 +659,7 @@ class Qwen2VisionTransformer(nn.Module):
         self,
         vision_config: Qwen2VLVisionConfig,
         norm_eps: float = 1e-6,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -766,7 +766,7 @@ class Qwen2VisionTransformer(nn.Module):
 
     def compute_attn_mask_seqlen(
         self, cu_seqlens: torch.Tensor
-    ) -> tuple[Optional[int], Optional[list[int]]]:
+    ) -> tuple[int | None, list[int] | None]:
         max_seqlen, seqlens = None, None
         if (
             self.attn_backend == _Backend.FLASH_ATTN
@@ -889,8 +889,8 @@ class Qwen2VLMultiModalDataParser(MultiModalDataParser):
 
     def _parse_image_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
-    ) -> Optional[ModalityDataItems[Any, Any]]:
+        data: dict[str, torch.Tensor] | ModalityData[ImageItem],
+    ) -> ModalityDataItems[Any, Any] | None:
         if isinstance(data, dict):
             return DictEmbeddingItems(
                 data,
@@ -903,8 +903,8 @@ class Qwen2VLMultiModalDataParser(MultiModalDataParser):
 
     def _parse_video_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]],
-    ) -> Optional[ModalityDataItems[Any, Any]]:
+        data: dict[str, torch.Tensor] | ModalityData[VideoItem],
+    ) -> ModalityDataItems[Any, Any] | None:
         if isinstance(data, dict):
             return DictEmbeddingItems(
                 data,
@@ -930,7 +930,7 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
     def get_image_processor(self, **kwargs: object) -> Qwen2VLImageProcessor:
         return self.get_hf_processor(**kwargs).image_processor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None, "video": None}
 
     def get_mm_max_tokens_per_item(
@@ -949,7 +949,7 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
         image_height: int,
         num_frames: int = 1,
         do_resize: bool = True,
-        image_processor: Optional[Qwen2VLImageProcessor],
+        image_processor: Qwen2VLImageProcessor | None,
     ) -> tuple[ImageSize, int]:
         if image_processor is None:
             image_processor = self.get_image_processor()
@@ -990,7 +990,7 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        image_processor: Optional[Qwen2VLImageProcessor],
+        image_processor: Qwen2VLImageProcessor | None,
     ) -> int:
         _, num_image_tokens = self._get_vision_info(
             image_width=image_width,
@@ -1006,7 +1006,7 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
         image_width: int,
         image_height: int,
         num_frames: int,
-        image_processor: Optional[Qwen2VLImageProcessor],
+        image_processor: Qwen2VLImageProcessor | None,
     ) -> int:
         _, num_video_tokens = self._get_vision_info(
             image_width=image_width,
@@ -1100,7 +1100,7 @@ class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
@@ -1207,12 +1207,12 @@ class Qwen2VLForConditionalGeneration(
         self,
         input_tokens: list[int],
         hf_config: PretrainedConfig,
-        image_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
-        video_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
-        second_per_grid_ts: Optional[list[float]] = None,
+        image_grid_thw: list[list[int]] | torch.Tensor | None,
+        video_grid_thw: list[list[int]] | torch.Tensor | None,
+        second_per_grid_ts: list[float] | None = None,
         context_len: int = 0,
-        seq_len: Optional[int] = None,
-        audio_feature_lengths: Optional[torch.Tensor] = None,
+        seq_len: int | None = None,
+        audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
         """Get M-RoPE input positions for Qwen2-VL model."""
@@ -1335,7 +1335,7 @@ class Qwen2VLForConditionalGeneration(
         return llm_positions, mrope_position_delta
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|vision_start|><|image_pad|><|vision_end|>"
         if modality.startswith("video"):
@@ -1396,7 +1396,7 @@ class Qwen2VLForConditionalGeneration(
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[Qwen2VLImageInputs]:
+    ) -> Qwen2VLImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
         image_grid_thw = kwargs.pop("image_grid_thw", None)
@@ -1434,7 +1434,7 @@ class Qwen2VLForConditionalGeneration(
 
     def _parse_and_validate_video_input(
         self, **kwargs: object
-    ) -> Optional[Qwen2VLVideoInputs]:
+    ) -> Qwen2VLVideoInputs | None:
         pixel_values_videos = kwargs.pop("pixel_values_videos", None)
         video_embeds = kwargs.pop("video_embeds", None)
         video_grid_thw = kwargs.pop("video_grid_thw", None)
@@ -1574,10 +1574,10 @@ class Qwen2VLForConditionalGeneration(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for Qwen2-VL.
 
         Args:
@@ -1606,7 +1606,7 @@ class Qwen2VLForConditionalGeneration(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
@@ -1634,7 +1634,7 @@ class Tarsier2MultiModalProcessor(Qwen2VLMultiModalProcessor):
 class Tarsier2ImageProcessor(Qwen2VLImageProcessor):
     def __init__(
         self,
-        size: Optional[dict[str, int]] = None,
+        size: dict[str, int] | None = None,
         **kwargs,
     ) -> None:
         if size is not None and "min_pixels" in size and "max_pixels" in size:
diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py
index bcd4968ba5c46..563d3cc23d726 100644
--- a/vllm/model_executor/models/qwen3.py
+++ b/vllm/model_executor/models/qwen3.py
@@ -24,7 +24,7 @@
 """Inference-only Qwen3 model compatible with HuggingFace weights."""
 
 from collections.abc import Iterable
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -58,16 +58,16 @@ class Qwen3Attention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         max_position: int = 4096 * 32,
-        head_dim: Optional[int] = None,
+        head_dim: int | None = None,
         rms_norm_eps: float = 1e-06,
         qkv_bias: bool = False,
         rope_theta: float = 10000,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        rope_scaling: Optional[tuple] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        rope_scaling: tuple | None = None,
         prefix: str = "",
         attn_type: str = AttentionType.DECODER,
-        dual_chunk_attention_config: Optional[dict[str, Any]] = None,
+        dual_chunk_attention_config: dict[str, Any] | None = None,
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -160,8 +160,8 @@ class Qwen3DecoderLayer(nn.Module):
     def __init__(
         self,
         config: Qwen3Config,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -214,7 +214,7 @@ class Qwen3DecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
@@ -315,9 +315,9 @@ class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -326,7 +326,7 @@ class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index 0769378933d52..8452d7b04f5c2 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -26,7 +26,7 @@
 import typing
 from collections.abc import Callable, Iterable
 from itertools import islice
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -84,7 +84,7 @@ class Qwen3MoeMLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         reduce_results: bool = True,
         prefix: str = "",
     ) -> None:
@@ -215,15 +215,15 @@ class Qwen3MoeAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        head_dim: Optional[int] = None,
+        head_dim: int | None = None,
         rms_norm_eps: float = 1e-06,
         qkv_bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
-        dual_chunk_attention_config: Optional[dict[str, Any]] = None,
+        dual_chunk_attention_config: dict[str, Any] | None = None,
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -374,7 +374,7 @@ class Qwen3MoeDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
@@ -432,11 +432,9 @@ class Qwen3MoeModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[
-        torch.Tensor, IntermediateTensors, tuple[torch.Tensor, list[torch.Tensor]]
-    ]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors | tuple[torch.Tensor, list[torch.Tensor]]:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -647,12 +645,7 @@ class Qwen3MoeForCausalLM(
         self.quant_config = quant_config
         # Only perform the following mapping when Qwen3MoeMLP exists
         if getattr(config, "mlp_only_layers", []):
-            self.packed_modules_mapping["gate_up_proj"] = (
-                [
-                    "gate_proj",
-                    "up_proj",
-                ],
-            )
+            self.packed_modules_mapping["gate_up_proj"] = ["gate_proj", "up_proj"]
         self.model = Qwen3MoeModel(
             vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
         )
@@ -742,9 +735,9 @@ class Qwen3MoeForCausalLM(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -753,7 +746,7 @@ class Qwen3MoeForCausalLM(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py
index 50629bb2e4a26..a29def57c4a08 100644
--- a/vllm/model_executor/models/qwen3_next.py
+++ b/vllm/model_executor/models/qwen3_next.py
@@ -4,7 +4,6 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional
 
 import torch
 from einops import rearrange
@@ -233,10 +232,10 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
     def __init__(
         self,
         config: Qwen3NextConfig,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        speculative_config: Optional[SpeculativeConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        speculative_config: SpeculativeConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -661,9 +660,9 @@ class Qwen3NextAttention(nn.Module):
     def __init__(
         self,
         config: Qwen3NextConfig,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -863,7 +862,7 @@ class Qwen3NextDecoderLayer(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
         positions: torch.Tensor = None,
         **kwargs: object,
     ):
@@ -971,8 +970,8 @@ class Qwen3NextModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
@@ -1207,8 +1206,8 @@ class Qwen3NextForCausalLM(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ):
         hidden_states = self.model(
@@ -1251,7 +1250,7 @@ class Qwen3NextForCausalLM(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.logits_processor(self.lm_head, hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/qwen3_next_mtp.py b/vllm/model_executor/models/qwen3_next_mtp.py
index 828931716c8f9..a447484ae82a0 100644
--- a/vllm/model_executor/models/qwen3_next_mtp.py
+++ b/vllm/model_executor/models/qwen3_next_mtp.py
@@ -3,7 +3,6 @@
 """Inference-only Qwen3Next MTP model."""
 
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 from torch import nn
@@ -108,8 +107,8 @@ class Qwen3NextMultiTokenPredictor(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
@@ -275,8 +274,8 @@ class Qwen3NextMTP(nn.Module, SupportsPP):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ):
         hidden_states = self.model(
@@ -288,7 +287,7 @@ class Qwen3NextMTP(nn.Module, SupportsPP):
         self,
         hidden_states: torch.Tensor,
         spec_step_idx: int = 0,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.logits_processor(self.lm_head, hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index 6eb9faabd1c7f..b1eceaa6ef41d 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -22,9 +22,9 @@
 # limitations under the License.
 """Inference-only Qwen3-Omni-Moe model (thinker part)."""
 
-from collections.abc import Iterable, Mapping, Sequence
+from collections.abc import Callable, Iterable, Mapping, Sequence
 from functools import partial
-from typing import Any, Callable, Optional, Union
+from typing import Any
 
 import numpy as np
 import torch
@@ -156,7 +156,7 @@ class Qwen3_VisionMLP(nn.Module):
         hidden_features: int,
         bias: bool = False,
         act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -190,8 +190,8 @@ class Qwen3_VisionBlock(nn.Module):
         num_heads: int,
         mlp_hidden_dim: int,
         act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
-        norm_layer: Optional[Callable[[int], nn.Module]] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        norm_layer: Callable[[int], nn.Module] | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -220,8 +220,8 @@ class Qwen3_VisionBlock(nn.Module):
         x: torch.Tensor,
         cu_seqlens: torch.Tensor,
         rotary_pos_emb: torch.Tensor,
-        max_seqlen: Optional[int] = None,  # Only used for Flash Attention
-        seqlens: Optional[list[int]] = None,  # Only used for xFormers
+        max_seqlen: int | None = None,  # Only used for Flash Attention
+        seqlens: list[int] | None = None,  # Only used for xFormers
     ) -> torch.Tensor:
         x = x + self.attn(
             self.norm1(x),
@@ -240,10 +240,10 @@ class Qwen3_VisionPatchMerger(nn.Module):
         self,
         d_model: int,
         context_dim: int,
-        norm_layer: Optional[Callable[[int], nn.Module]] = None,
+        norm_layer: Callable[[int], nn.Module] | None = None,
         spatial_merge_size: int = 2,
         use_postshuffle_norm: bool = False,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -297,7 +297,7 @@ class Qwen3Omni_VisionTransformer(nn.Module):
         self,
         vision_config,
         norm_eps: float = 1e-6,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -482,7 +482,7 @@ class Qwen3Omni_VisionTransformer(nn.Module):
     def compute_attn_mask_seqlen(
         self,
         cu_seqlens: torch.Tensor,
-    ) -> tuple[Optional[int], Optional[list[int]]]:
+    ) -> tuple[int | None, list[int] | None]:
         max_seqlen, seqlens = None, None
         if self.attn_backend == _Backend.FLASH_ATTN:
             max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
@@ -595,10 +595,10 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        deepstack_input_embeds: Optional[IntermediateTensors] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        deepstack_input_embeds: IntermediateTensors | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -683,7 +683,7 @@ class Qwen3OmniMoeThinkerProcessingInfo(
         assert isinstance(feature_extractor, WhisperFeatureExtractor)
         return feature_extractor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None, "image": None, "video": None}
 
 
@@ -818,7 +818,7 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
         self,
         thinker_config: PretrainedConfig,
         audio_len: int,
-        video_grid_thw: Union[list[int], torch.Tensor],
+        video_grid_thw: list[int] | torch.Tensor,
         video_second_per_grid_t: float,
     ) -> list[int]:
         shift = 0
@@ -1100,7 +1100,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
     )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|vision_start|><|image_pad|><|vision_end|>"
         if modality.startswith("video"):
@@ -1246,7 +1246,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
 
     def get_multimodal_embeddings(
         self, **kwargs: object
-    ) -> Optional[MultiModalEmbeddings]:
+    ) -> MultiModalEmbeddings | None:
         mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not mm_input_by_modality:
             return []
@@ -1273,9 +1273,9 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
         *,
-        is_multimodal: Optional[torch.Tensor] = None,
+        is_multimodal: torch.Tensor | None = None,
         handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
         inputs_embeds = self._get_text_embeddings(
@@ -1356,10 +1356,10 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
@@ -1391,7 +1391,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
@@ -1408,12 +1408,12 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
         self,
         input_tokens: list[int],
         hf_config: PretrainedConfig,
-        image_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
-        video_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
-        second_per_grid_ts: Optional[list[float]] = None,
+        image_grid_thw: list[list[int]] | torch.Tensor | None,
+        video_grid_thw: list[list[int]] | torch.Tensor | None,
+        second_per_grid_ts: list[float] | None = None,
         context_len: int = 0,
-        seq_len: Optional[int] = None,
-        audio_feature_lengths: Optional[torch.Tensor] = None,
+        seq_len: int | None = None,
+        audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
         config = hf_config.thinker_config
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 6a7d2eaeab3b8..0f706ab55a07a 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -24,10 +24,10 @@
 # limitations under the License.
 """Inference-only Qwen3VL model compatible with HuggingFace weights."""
 
-from collections.abc import Iterable, Mapping, Sequence
+from collections.abc import Callable, Iterable, Mapping, Sequence
 from functools import partial
 from itertools import islice
-from typing import Any, Callable, Optional, Union
+from typing import Any
 
 import numpy as np
 import torch
@@ -151,7 +151,7 @@ class Qwen3_VisionMLP(nn.Module):
         hidden_features: int,
         bias: bool = False,
         act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -188,8 +188,8 @@ class Qwen3_VisionBlock(nn.Module):
         num_heads: int,
         mlp_hidden_dim: int,
         act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu,
-        norm_layer: Optional[Callable[[int], nn.Module]] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        norm_layer: Callable[[int], nn.Module] | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
         attn_backend: _Backend = _Backend.TORCH_SDPA,
@@ -225,8 +225,8 @@ class Qwen3_VisionBlock(nn.Module):
         x: torch.Tensor,
         cu_seqlens: torch.Tensor,
         rotary_pos_emb: torch.Tensor,
-        max_seqlen: Optional[int] = None,  # Only used for Flash Attention
-        seqlens: Optional[list[int]] = None,  # Only used for xFormers
+        max_seqlen: int | None = None,  # Only used for Flash Attention
+        seqlens: list[int] | None = None,  # Only used for xFormers
     ) -> torch.Tensor:
         x = x + self.attn(
             self.norm1(x),
@@ -245,10 +245,10 @@ class Qwen3_VisionPatchMerger(nn.Module):
         self,
         d_model: int,
         context_dim: int,
-        norm_layer: Optional[Callable[[int], nn.Module]] = None,
+        norm_layer: Callable[[int], nn.Module] | None = None,
         spatial_merge_size: int = 2,
         use_postshuffle_norm: bool = False,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -297,7 +297,7 @@ class Qwen3_VisionTransformer(nn.Module):
         self,
         vision_config: Qwen3VLVisionConfig,
         norm_eps: float = 1e-6,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ) -> None:
@@ -511,7 +511,7 @@ class Qwen3_VisionTransformer(nn.Module):
     def compute_attn_mask_seqlen(
         self,
         cu_seqlens: torch.Tensor,
-    ) -> tuple[Optional[int], Optional[list[int]]]:
+    ) -> tuple[int | None, list[int] | None]:
         max_seqlen, seqlens = None, None
         if (
             self.attn_backend == _Backend.FLASH_ATTN
@@ -625,9 +625,7 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
         image_height: int,
         num_frames: int = 2,
         do_resize: bool = True,
-        image_processor: Optional[
-            Union[Qwen2VLImageProcessorFast, Qwen3VLVideoProcessor]
-        ],
+        image_processor: Qwen2VLImageProcessorFast | Qwen3VLVideoProcessor | None,
     ) -> tuple[ImageSize, int]:
         if image_processor is None and num_frames > 1:
             image_processor = self.get_video_processor()
@@ -726,8 +724,8 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
         self,
         metadata: dict[str, Any],
         out_item: MultiModalKwargsItem,
-        do_sample_frames: Optional[bool] = None,
-        sampled_fps: Optional[float] = None,
+        do_sample_frames: bool | None = None,
+        sampled_fps: float | None = None,
     ) -> list[int]:
         video_processor = self.get_video_processor()
         merge_size = video_processor.merge_size
@@ -778,7 +776,7 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
@@ -1096,11 +1094,11 @@ class Qwen3LLMModel(Qwen3Model):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         # args for deepstack
-        deepstack_input_embeds: Optional[IntermediateTensors] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        deepstack_input_embeds: IntermediateTensors | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -1201,7 +1199,7 @@ class Qwen3VLForConditionalGeneration(
     )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<|vision_start|><|image_pad|><|vision_end|>"
         if modality.startswith("video"):
@@ -1314,7 +1312,7 @@ class Qwen3VLForConditionalGeneration(
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[Qwen2_5_VLImageInputs]:
+    ) -> Qwen2_5_VLImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
         image_grid_thw = kwargs.pop("image_grid_thw", None)
@@ -1363,7 +1361,7 @@ class Qwen3VLForConditionalGeneration(
 
     def _parse_and_validate_video_input(
         self, **kwargs: object
-    ) -> Optional[Qwen2_5_VLVideoInputs]:
+    ) -> Qwen2_5_VLVideoInputs | None:
         pixel_values_videos = kwargs.pop("pixel_values_videos", None)
         video_embeds = kwargs.pop("video_embeds", None)
         video_grid_thw = kwargs.pop("video_grid_thw", None)
@@ -1486,12 +1484,12 @@ class Qwen3VLForConditionalGeneration(
         cls,
         input_tokens: list[int],
         hf_config: PretrainedConfig,
-        image_grid_thw: Union[list[list[int]], torch.Tensor],
-        video_grid_thw: Union[list[list[int]], torch.Tensor],
+        image_grid_thw: list[list[int]] | torch.Tensor,
+        video_grid_thw: list[list[int]] | torch.Tensor,
         context_len: int = 0,
-        seq_len: Optional[int] = None,
-        second_per_grid_ts: Optional[list[float]] = None,
-        audio_feature_lengths: Optional[torch.Tensor] = None,
+        seq_len: int | None = None,
+        second_per_grid_ts: list[float] | None = None,
+        audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
         """Get mrope input positions and delta value."""
@@ -1596,7 +1594,7 @@ class Qwen3VLForConditionalGeneration(
 
     def get_multimodal_embeddings(
         self, **kwargs: object
-    ) -> Optional[MultiModalEmbeddings]:
+    ) -> MultiModalEmbeddings | None:
         mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not mm_input_by_modality:
             return None
@@ -1661,9 +1659,9 @@ class Qwen3VLForConditionalGeneration(
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
         *,
-        is_multimodal: Optional[torch.Tensor] = None,
+        is_multimodal: torch.Tensor | None = None,
         handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
         inputs_embeds = self._get_text_embeddings(
@@ -1710,10 +1708,10 @@ class Qwen3VLForConditionalGeneration(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for Qwen3VL.
 
         Args:
@@ -1769,7 +1767,7 @@ class Qwen3VLForConditionalGeneration(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py
index db7bcb0436595..21b2e395c77f3 100644
--- a/vllm/model_executor/models/qwen3_vl_moe.py
+++ b/vllm/model_executor/models/qwen3_vl_moe.py
@@ -25,9 +25,8 @@
 """Inference-only Qwen3-VL-MoE model compatible with HuggingFace weights."""
 
 import typing
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from itertools import islice
-from typing import Callable, Optional, Union
 
 import torch
 from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeConfig
@@ -90,10 +89,10 @@ class Qwen3MoeLLMModel(Qwen3MoeModel):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        deepstack_input_embeds: Optional[IntermediateTensors] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        deepstack_input_embeds: IntermediateTensors | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py
index 1786ea6a6878b..f011229985c87 100644
--- a/vllm/model_executor/models/qwen_vl.py
+++ b/vllm/model_executor/models/qwen_vl.py
@@ -9,9 +9,9 @@
 import copy
 import math
 import unicodedata
-from collections.abc import Collection, Mapping, Sequence, Set
+from collections.abc import Callable, Collection, Mapping, Sequence, Set
 from functools import lru_cache, partial
-from typing import Annotated, Callable, Literal, Optional, Union
+from typing import Annotated, Literal, TypeAlias
 
 import regex as re
 import torch
@@ -93,7 +93,7 @@ class QwenImageEmbeddingInputs(TensorSchema):
     data: Annotated[torch.Tensor, TensorShape("bn", 256, "hs")]
 
 
-QwenImageInputs = Union[QwenImagePixelInputs, QwenImageEmbeddingInputs]
+QwenImageInputs: TypeAlias = QwenImagePixelInputs | QwenImageEmbeddingInputs
 
 
 class VisualAttention(nn.Module):
@@ -107,8 +107,8 @@ class VisualAttention(nn.Module):
         embed_dim: int,
         num_heads: int,
         bias: bool = True,
-        kdim: Optional[int] = None,
-        vdim: Optional[int] = None,
+        kdim: int | None = None,
+        vdim: int | None = None,
     ):
         super().__init__()
         self.embed_dim = embed_dim
@@ -135,7 +135,7 @@ class VisualAttention(nn.Module):
     def forward(
         self,
         x: torch.Tensor,
-        attn_mask: Optional[torch.Tensor] = None,
+        attn_mask: torch.Tensor | None = None,
     ) -> torch.Tensor:
         # query/key/value: [sq, b, h]
         sq, b, _ = x.size()
@@ -213,7 +213,7 @@ class QwenVLMLP(nn.Module):
         self,
         hidden_size: int,
         intermediate_size: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
         self.c_fc = ColumnParallelLinear(
@@ -241,7 +241,7 @@ class VisualAttentionBlock(nn.Module):
         n_head: int,
         mlp_ratio: float = 4.0,
         norm_layer: Callable[[int], nn.Module] = nn.LayerNorm,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
 
@@ -258,7 +258,7 @@ class VisualAttentionBlock(nn.Module):
     def attention(
         self,
         x: torch.Tensor,
-        attn_mask: Optional[torch.Tensor] = None,
+        attn_mask: torch.Tensor | None = None,
     ) -> torch.Tensor:
         attn_mask = attn_mask.to(x.dtype) if attn_mask is not None else None
         return self.attn(x, attn_mask=attn_mask)
@@ -266,7 +266,7 @@ class VisualAttentionBlock(nn.Module):
     def forward(
         self,
         x: torch.Tensor,
-        attn_mask: Optional[torch.Tensor] = None,
+        attn_mask: torch.Tensor | None = None,
     ) -> torch.Tensor:
         x = x + self.attention(self.ln_1(x), attn_mask=attn_mask)
         x = x + self.mlp(self.ln_2(x))
@@ -281,7 +281,7 @@ class TransformerBlock(nn.Module):
         heads: int,
         mlp_ratio: float = 4.0,
         norm_layer: Callable[[int], nn.Module] = nn.LayerNorm,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
     ):
         super().__init__()
         self.width = width
@@ -307,7 +307,7 @@ class TransformerBlock(nn.Module):
         return self.resblocks[0].mlp.c_fc.weight.device
 
     def forward(
-        self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None
+        self, x: torch.Tensor, attn_mask: torch.Tensor | None = None
     ) -> torch.Tensor:
         for r in self.resblocks:
             x = r(x, attn_mask=attn_mask)
@@ -326,7 +326,7 @@ class VisionTransformer(nn.Module):
         n_queries: int = 256,
         output_dim: int = 512,
         image_start_id: int = 151857,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         **kwargs,
     ):
         super().__init__()
@@ -434,10 +434,10 @@ def _get_tokenizer_without_image_pad(
         def tokenize(
             self,
             text: str,
-            allowed_special: Union[Set[str], str] = "all",
-            disallowed_special: Union[Collection[str], str] = (),
+            allowed_special: Set[str] | str = "all",
+            disallowed_special: Collection[str] | str = (),
             **kwargs,
-        ) -> list[Union[bytes, str]]:
+        ) -> list[bytes | str]:
             text = unicodedata.normalize("NFC", text)
 
             return [
@@ -451,9 +451,9 @@ def _get_tokenizer_without_image_pad(
 
         def _decode(
             self,
-            token_ids: Union[int, list[int]],
+            token_ids: int | list[int],
             skip_special_tokens: bool = False,
-            errors: Optional[str] = None,
+            errors: str | None = None,
             **kwargs,
         ) -> str:
             if isinstance(token_ids, int):
@@ -523,9 +523,9 @@ class QwenVLProcessor:
 
     def __call__(
         self,
-        text: Optional[Union[TextInput, list[TextInput]]] = None,
-        images: Optional[Union[ImageInput, list[ImageInput]]] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        text: TextInput | list[TextInput] | None = None,
+        images: ImageInput | list[ImageInput] | None = None,
+        return_tensors: str | TensorType | None = None,
     ) -> BatchFeature:
         if text is None:
             text = []
@@ -568,7 +568,7 @@ class QwenVLProcessingInfo(BaseProcessingInfo):
             **kwargs,
         )
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_num_image_tokens(self) -> int:
@@ -597,7 +597,7 @@ class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         hf_config = self.info.get_hf_config()
         vision_config = hf_config.visual
@@ -722,7 +722,7 @@ class QwenVLForConditionalGeneration(
         )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return f"Picture {i}: <img></img>"
 
@@ -745,7 +745,7 @@ class QwenVLForConditionalGeneration(
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[QwenImageInputs]:
+    ) -> QwenImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
 
@@ -799,10 +799,10 @@ class QwenVLForConditionalGeneration(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
diff --git a/vllm/model_executor/models/radio.py b/vllm/model_executor/models/radio.py
index 2313b98348b77..6cda80f5ebe75 100644
--- a/vllm/model_executor/models/radio.py
+++ b/vllm/model_executor/models/radio.py
@@ -11,7 +11,7 @@
 import math
 from collections.abc import Iterable
 from itertools import repeat
-from typing import Optional, Union
+from typing import TypeAlias
 
 import torch
 import torch.nn as nn
@@ -23,8 +23,8 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.intern_vit import InternVisionEncoder
 
-input_dim_t = Union[int, tuple[int, int]]
-norm_t = Union[tuple[float, float, float], torch.Tensor]
+input_dim_t: TypeAlias = int | tuple[int, int]
+norm_t: TypeAlias = tuple[float, float, float] | torch.Tensor
 
 
 def _ntuple(n):
@@ -75,8 +75,8 @@ class ClsToken(nn.Module):
         ndim: int,
         num_tokens: int = 1,
         enabled: bool = True,
-        register_multiple: Optional[int] = None,
-        num_registers: Optional[int] = None,
+        register_multiple: int | None = None,
+        num_registers: int | None = None,
     ):
         super().__init__()
 
@@ -128,12 +128,12 @@ class ViTPatchGenerator(nn.Module):
         abs_pos: bool = True,
         normalize_patches: bool = False,
         cls_token: bool = False,
-        max_input_dims: Optional[input_dim_t] = None,
+        max_input_dims: input_dim_t | None = None,
         pos_dropout: float = 0.0,
         return_pos_enc: bool = False,
         num_cls_tokens: int = 1,
-        register_multiple: Optional[int] = None,
-        num_registers: Optional[int] = None,
+        register_multiple: int | None = None,
+        num_registers: int | None = None,
         patch_bias: bool = False,
         device=None,
         dtype=None,
@@ -275,8 +275,8 @@ class ViTPatchGenerator(nn.Module):
     def apply_pos_enc(
         self,
         patches: torch.Tensor,
-        patch_idxs: Optional[torch.Tensor] = None,
-        input_size: Optional[tuple[int, int]] = None,
+        patch_idxs: torch.Tensor | None = None,
+        input_size: tuple[int, int] | None = None,
     ) -> torch.Tensor:
         if not self.abs_pos:
             return patches
@@ -299,8 +299,8 @@ class ViTPatchGenerator(nn.Module):
     def get_pos_enc(
         self,
         batch_size: int,
-        patch_idxs: Optional[torch.Tensor] = None,
-        input_size: Optional[tuple[int, int]] = None,
+        patch_idxs: torch.Tensor | None = None,
+        input_size: tuple[int, int] | None = None,
     ) -> torch.Tensor:
         if input_size is None:
             input_dims = self.input_dims
@@ -440,9 +440,9 @@ class RadioInternVisionModel(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
+        num_hidden_layers_override: int | None = None,
         num_dummy_heads: int = 0,
         prefix: str = "",
     ) -> None:
@@ -472,7 +472,7 @@ class RadioInternVisionModel(nn.Module):
             prefix=f"{prefix}.encoder",
         )
 
-    def _init_img_size(self, patch_size, img_size: Union[int, tuple[int, int]]):
+    def _init_img_size(self, patch_size, img_size: int | tuple[int, int]):
         if img_size is None:
             return None, None, None
         img_size = to_2tuple(img_size)
@@ -498,9 +498,9 @@ class RadioModel(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
+        num_hidden_layers_override: int | None = None,
         num_dummy_heads: int = 0,
         prefix: str = "",
     ) -> None:
@@ -522,8 +522,8 @@ class RadioModel(nn.Module):
 
     def forward(
         self,
-        pixel_values: Optional[torch.Tensor] = None,
-        pixel_embeds: Optional[torch.Tensor] = None,
+        pixel_values: torch.Tensor | None = None,
+        pixel_embeds: torch.Tensor | None = None,
     ) -> torch.FloatTensor:
         x = self.input_conditioner(pixel_values)
         y = self.model(x)
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index a52fcb3eeef3c..194d2593a7fe5 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -14,11 +14,11 @@ import subprocess
 import sys
 import tempfile
 from abc import ABC, abstractmethod
-from collections.abc import Set
+from collections.abc import Callable, Set
 from dataclasses import asdict, dataclass, field
 from functools import lru_cache
 from pathlib import Path
-from typing import Callable, Optional, TypeVar, Union
+from typing import TypeVar
 
 import torch.nn as nn
 import transformers
@@ -654,7 +654,7 @@ class _LazyRegisteredModel(_BaseRegisteredModel):
 def _try_load_model_cls(
     model_arch: str,
     model: _BaseRegisteredModel,
-) -> Optional[type[nn.Module]]:
+) -> type[nn.Module] | None:
     from vllm.platforms import current_platform
 
     current_platform.verify_model_arch(model_arch)
@@ -669,7 +669,7 @@ def _try_load_model_cls(
 def _try_inspect_model_cls(
     model_arch: str,
     model: _BaseRegisteredModel,
-) -> Optional[_ModelInfo]:
+) -> _ModelInfo | None:
     try:
         return model.inspect_model_cls()
     except Exception:
@@ -688,7 +688,7 @@ class _ModelRegistry:
     def register_model(
         self,
         model_arch: str,
-        model_cls: Union[type[nn.Module], str],
+        model_cls: type[nn.Module] | str,
     ) -> None:
         """
         Register an external model to be used in vLLM.
@@ -756,13 +756,13 @@ class _ModelRegistry:
             f"Supported architectures: {all_supported_archs}"
         )
 
-    def _try_load_model_cls(self, model_arch: str) -> Optional[type[nn.Module]]:
+    def _try_load_model_cls(self, model_arch: str) -> type[nn.Module] | None:
         if model_arch not in self.models:
             return None
 
         return _try_load_model_cls(model_arch, self.models[model_arch])
 
-    def _try_inspect_model_cls(self, model_arch: str) -> Optional[_ModelInfo]:
+    def _try_inspect_model_cls(self, model_arch: str) -> _ModelInfo | None:
         if model_arch not in self.models:
             return None
 
@@ -772,7 +772,7 @@ class _ModelRegistry:
         self,
         architecture: str,
         model_config: ModelConfig,
-    ) -> Optional[str]:
+    ) -> str | None:
         if architecture in _TRANSFORMERS_BACKEND_MODELS:
             return architecture
 
@@ -862,7 +862,7 @@ class _ModelRegistry:
 
     def inspect_model_cls(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> tuple[_ModelInfo, str]:
         if isinstance(architectures, str):
@@ -914,7 +914,7 @@ class _ModelRegistry:
 
     def resolve_model_cls(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> tuple[type[nn.Module], str]:
         if isinstance(architectures, str):
@@ -968,7 +968,7 @@ class _ModelRegistry:
 
     def is_text_generation_model(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
@@ -976,7 +976,7 @@ class _ModelRegistry:
 
     def is_pooling_model(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
@@ -984,7 +984,7 @@ class _ModelRegistry:
 
     def is_cross_encoder_model(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
@@ -992,7 +992,7 @@ class _ModelRegistry:
 
     def is_multimodal_model(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
@@ -1000,7 +1000,7 @@ class _ModelRegistry:
 
     def is_multimodal_raw_input_only_model(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
@@ -1008,7 +1008,7 @@ class _ModelRegistry:
 
     def is_pp_supported_model(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
@@ -1016,7 +1016,7 @@ class _ModelRegistry:
 
     def model_has_inner_state(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
@@ -1024,7 +1024,7 @@ class _ModelRegistry:
 
     def is_attention_free_model(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
@@ -1032,7 +1032,7 @@ class _ModelRegistry:
 
     def is_hybrid_model(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
@@ -1040,7 +1040,7 @@ class _ModelRegistry:
 
     def is_noops_model(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
@@ -1048,7 +1048,7 @@ class _ModelRegistry:
 
     def is_transcription_model(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
@@ -1056,7 +1056,7 @@ class _ModelRegistry:
 
     def is_transcription_only_model(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
@@ -1064,7 +1064,7 @@ class _ModelRegistry:
 
     def is_v1_compatible(
         self,
-        architectures: Union[str, list[str]],
+        architectures: str | list[str],
         model_config: ModelConfig,
     ) -> bool:
         model_cls, _ = self.inspect_model_cls(architectures, model_config)
diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py
index 6408cf7937b2f..456226360b91b 100644
--- a/vllm/model_executor/models/roberta.py
+++ b/vllm/model_executor/models/roberta.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -68,7 +67,7 @@ class RobertaEmbedding(nn.Module):
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         token_type_ids = _decode_token_type_ids(input_ids)
 
@@ -124,8 +123,8 @@ class RobertaEmbeddingModel(BertEmbeddingModel):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         # Fix Roberta positions here outside of the CUDA graph.
         # Because we need the to extract the sequences from
@@ -143,7 +142,7 @@ class RobertaEmbeddingModel(BertEmbeddingModel):
 
     def _build_model(
         self, vllm_config: VllmConfig, prefix: str = ""
-    ) -> Union[BertModel, BertWithRope]:
+    ) -> BertModel | BertWithRope:
         if vllm_config.model_config.hf_config.position_embedding_type == "rotary":
             return JinaRobertaModel(vllm_config=vllm_config, prefix=prefix)
         else:
@@ -240,11 +239,11 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        token_type_ids: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        token_type_ids: torch.Tensor | None = None,
     ) -> torch.Tensor:
         replace_roberta_positions(
             input_ids=input_ids, position_ids=positions, padding_idx=self.padding_idx
diff --git a/vllm/model_executor/models/rvl.py b/vllm/model_executor/models/rvl.py
index 89150677f3ce8..92352febe87ec 100644
--- a/vllm/model_executor/models/rvl.py
+++ b/vllm/model_executor/models/rvl.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Mapping
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -41,7 +40,7 @@ class RVLDummyInputsBuilder(LlavaDummyInputsBuilder[RVLProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py
index ca33a694a3b61..641160295afb3 100644
--- a/vllm/model_executor/models/seed_oss.py
+++ b/vllm/model_executor/models/seed_oss.py
@@ -25,7 +25,6 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -75,7 +74,7 @@ class SeedOssMLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -115,9 +114,9 @@ class SeedOssAttention(nn.Module):
         head_dim: int,
         max_position: int = 4096 * 32,
         rope_theta: float = 10000,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        rope_scaling: Optional[tuple] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
+        rope_scaling: tuple | None = None,
         prefix: str = "",
         attn_type: str = AttentionType.DECODER,
     ) -> None:
@@ -195,8 +194,8 @@ class SeedOssDecoderLayer(nn.Module):
     def __init__(
         self,
         config: SeedOssConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -243,7 +242,7 @@ class SeedOssDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
@@ -342,9 +341,9 @@ class SeedOssModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -477,9 +476,9 @@ class SeedOssForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -488,7 +487,7 @@ class SeedOssForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index ee21a03c8525d..b79dc31cfe3d4 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -5,7 +5,6 @@ within a vision language model."""
 
 import math
 from collections.abc import Iterable
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -153,7 +152,7 @@ class SiglipAttention(nn.Module):
     def __init__(
         self,
         config: SiglipVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -211,7 +210,7 @@ class SiglipMLP(nn.Module):
     def __init__(
         self,
         config: SiglipVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -251,7 +250,7 @@ class SiglipEncoderLayer(nn.Module):
     def __init__(
         self,
         config: SiglipVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -293,8 +292,8 @@ class SiglipEncoder(nn.Module):
     def __init__(
         self,
         config: SiglipVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-        num_hidden_layers_override: Optional[int] = None,
+        quant_config: QuantizationConfig | None = None,
+        num_hidden_layers_override: int | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -321,7 +320,7 @@ class SiglipEncoder(nn.Module):
         self,
         inputs_embeds: torch.Tensor,
         return_all_hidden_states: bool,
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+    ) -> torch.Tensor | list[torch.Tensor]:
         hidden_states_pool = [inputs_embeds]
         hidden_states = inputs_embeds
 
@@ -342,7 +341,7 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module):
     def __init__(
         self,
         config: SiglipVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -375,10 +374,10 @@ class SiglipVisionTransformer(nn.Module):
     def __init__(
         self,
         config: SiglipVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
-        require_post_norm: Optional[bool] = None,
+        num_hidden_layers_override: int | None = None,
+        require_post_norm: bool | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -426,8 +425,8 @@ class SiglipVisionTransformer(nn.Module):
         pixel_values: torch.Tensor,
         *,
         interpolate_pos_encoding: bool = False,
-        select_layers: Optional[list[int]] = None,
-        feature_select_strategy: Optional[VisionFeatureSelectStrategy] = None,
+        select_layers: list[int] | None = None,
+        feature_select_strategy: VisionFeatureSelectStrategy | None = None,
     ) -> torch.Tensor:
         hidden_states = self.embeddings(
             pixel_values,
@@ -464,10 +463,10 @@ class SiglipVisionModel(nn.Module):
     def __init__(
         self,
         config: SiglipVisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         *,
-        num_hidden_layers_override: Optional[int] = None,
-        require_post_norm: Optional[bool] = None,
+        num_hidden_layers_override: int | None = None,
+        require_post_norm: bool | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -491,8 +490,8 @@ class SiglipVisionModel(nn.Module):
         self,
         pixel_values: torch.Tensor,
         interpolate_pos_encoding: bool = False,
-        select_layers: Optional[list[int]] = None,
-        feature_select_strategy: Optional[VisionFeatureSelectStrategy] = None,
+        select_layers: list[int] | None = None,
+        feature_select_strategy: VisionFeatureSelectStrategy | None = None,
     ) -> torch.Tensor:
         return self.vision_model(
             pixel_values=pixel_values,
diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py
index 81f7e9887acee..e7af0e7a7ae41 100644
--- a/vllm/model_executor/models/siglip2navit.py
+++ b/vllm/model_executor/models/siglip2navit.py
@@ -4,7 +4,6 @@
 within a vision language model."""
 
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 from einops import rearrange, repeat
@@ -82,7 +81,7 @@ class Siglip2VisionEmbeddings(nn.Module):
     def forward(
         self,
         pixel_values: torch.FloatTensor,
-        grid_thws: Optional[torch.LongTensor] = None,
+        grid_thws: torch.LongTensor | None = None,
     ) -> torch.Tensor:
         """
         Args:
@@ -206,7 +205,7 @@ class Siglip2Attention(nn.Module):
     def __init__(
         self,
         config: Siglip2VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -275,8 +274,8 @@ class Siglip2Attention(nn.Module):
         self,
         hidden_states: torch.Tensor,
         cu_seqlens: torch.Tensor,
-        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         """Input shape: Batch x Time x Channel"""
 
         seq_length, embed_dim = hidden_states.shape
@@ -337,7 +336,7 @@ class Siglip2MLP(nn.Module):
     def __init__(
         self,
         config: Siglip2VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -370,7 +369,7 @@ class Siglip2EncoderLayer(nn.Module):
     def __init__(
         self,
         config: Siglip2VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -432,7 +431,7 @@ class Siglip2Encoder(nn.Module):
     def __init__(
         self,
         config: Siglip2VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -616,7 +615,7 @@ class Siglip2VisionTransformer(nn.Module):
     def __init__(
         self,
         config: Siglip2VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -655,7 +654,7 @@ class Siglip2NavitModel(torch.nn.Module):
     def __init__(
         self,
         config: Siglip2VisionConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py
index f0f6917ddf913..50d98d1d5bdd7 100644
--- a/vllm/model_executor/models/skyworkr1v.py
+++ b/vllm/model_executor/models/skyworkr1v.py
@@ -8,7 +8,7 @@
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Literal, Optional, Union
+from typing import Annotated, Literal, TypeAlias
 
 import torch
 import torch.nn as nn
@@ -96,14 +96,14 @@ class SkyworkR1VImageEmbeddingInputs(TensorSchema):
     type: Literal["image_embeds"] = "image_embeds"
 
     data: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]],
+        torch.Tensor | list[torch.Tensor],
         TensorShape("ni", "ifs", "hs"),
     ]
 
 
-SkyworkR1VImageInputs = Union[
-    SkyworkR1VImagePixelInputs, SkyworkR1VImageEmbeddingInputs
-]
+SkyworkR1VImageInputs: TypeAlias = (
+    SkyworkR1VImagePixelInputs | SkyworkR1VImageEmbeddingInputs
+)
 
 
 # adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
@@ -284,9 +284,9 @@ class SkyworkR1VProcessor:
         config: PretrainedConfig,
         tokenizer: AnyTokenizer,
         *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
     ) -> None:
         super().__init__()
 
@@ -324,7 +324,7 @@ class SkyworkR1VProcessor:
     def get_image_repl(
         self,
         feature_size: int,
-        num_patches: Optional[int],
+        num_patches: int | None,
     ) -> PromptUpdateDetails[str]:
         repl_features = IMG_CONTEXT * feature_size
         repl_full = IMG_START + repl_features + IMG_END
@@ -334,10 +334,10 @@ class SkyworkR1VProcessor:
     def resolve_min_max_num(
         self,
         *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        use_thumbnail: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        use_thumbnail: bool | None = None,
     ) -> tuple[int, int]:
         min_dynamic_patch = (
             self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
@@ -362,10 +362,10 @@ class SkyworkR1VProcessor:
     def resolve_target_ratios(
         self,
         *,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        use_thumbnail: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        use_thumbnail: bool | None = None,
     ) -> list[tuple[int, int]]:
         min_num, max_num = self.resolve_min_max_num(
             min_dynamic_patch=min_dynamic_patch,
@@ -399,9 +399,9 @@ class SkyworkR1VProcessor:
     def _images_to_pixel_values_lst(
         self,
         images: list[Image.Image],
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
     ) -> list[torch.Tensor]:
         min_num, max_num = self.resolve_min_max_num(
             min_dynamic_patch=min_dynamic_patch,
@@ -423,12 +423,12 @@ class SkyworkR1VProcessor:
 
     def __call__(
         self,
-        text: Optional[Union[str, list[str]]] = None,
-        images: Optional[Union[Image.Image, list[Image.Image]]] = None,
-        min_dynamic_patch: Optional[int] = None,
-        max_dynamic_patch: Optional[int] = None,
-        dynamic_image_size: Optional[bool] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        text: str | list[str] | None = None,
+        images: Image.Image | list[Image.Image] | None = None,
+        min_dynamic_patch: int | None = None,
+        max_dynamic_patch: int | None = None,
+        dynamic_image_size: bool | None = None,
+        return_tensors: str | TensorType | None = None,
     ) -> BatchFeature:
         if text is None:
             text = []
@@ -479,7 +479,7 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo):
             **kwargs,
         )
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_num_image_tokens(
@@ -487,7 +487,7 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo):
         *,
         image_width: int,
         image_height: int,
-        processor: Optional[SkyworkR1VProcessor],
+        processor: SkyworkR1VProcessor | None,
     ) -> int:
         if processor is None:
             processor = self.get_hf_processor()
@@ -532,7 +532,7 @@ class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[SkyworkR1VProcessingIn
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         target_width, target_height = self.info.get_image_size_with_most_features()
         num_images = mm_counts.get("image", 0)
@@ -650,7 +650,7 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
     merge_by_field_config = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
 
@@ -715,7 +715,7 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
     def _init_vision_model(
         self,
         config: PretrainedConfig,
-        quant_config: Optional[QuantizationConfig],
+        quant_config: QuantizationConfig | None,
         *,
         is_mono: bool,
         prefix: str,
@@ -784,7 +784,7 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[SkyworkR1VImageInputs]:
+    ) -> SkyworkR1VImageInputs | None:
         pixel_values_flat = kwargs.pop("pixel_values_flat", None)
         image_num_patches = kwargs.pop("image_num_patches", None)
         image_embeds = kwargs.pop("image_embeds", None)
@@ -818,7 +818,7 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
     def _process_image_input(
         self,
         image_input: SkyworkR1VImageInputs,
-    ) -> Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor, ...]]:
+    ) -> torch.Tensor | list[torch.Tensor] | tuple[torch.Tensor, ...]:
         if image_input["type"] == "image_embeds":
             return image_input["data"]
 
@@ -864,9 +864,9 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
         *,
-        is_multimodal: Optional[torch.Tensor] = None,
+        is_multimodal: torch.Tensor | None = None,
         handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
         if multimodal_embeddings is not None and len(multimodal_embeddings) > 0:
@@ -887,8 +887,8 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ) -> IntermediateTensors:
         if intermediate_tensors is not None:
@@ -913,7 +913,7 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/smolvlm.py b/vllm/model_executor/models/smolvlm.py
index 1800330c8235f..e8b805297d963 100644
--- a/vllm/model_executor/models/smolvlm.py
+++ b/vllm/model_executor/models/smolvlm.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 from transformers import SmolVLMProcessor
 
@@ -17,9 +16,7 @@ class SmolVLMProcessingInfo(Idefics3ProcessingInfo):
     def get_hf_processor(self, **kwargs: object) -> SmolVLMProcessor:
         return self.ctx.get_hf_processor(SmolVLMProcessor, **kwargs)
 
-    def _get_image_token(
-        self, processor: Optional[SmolVLMProcessor]
-    ) -> tuple[str, str]:
+    def _get_image_token(self, processor: SmolVLMProcessor | None) -> tuple[str, str]:
         if processor is None:
             processor = self.get_hf_processor()
         image_token = processor.image_token
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index 5abcb47c6e25f..f0dfce7bc7b64 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -25,7 +25,7 @@
 """Inference-only Solar model compatible with HuggingFace weights."""
 
 from collections.abc import Iterable
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -73,7 +73,7 @@ class SolarMLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
         prefix: str = "",
     ) -> None:
@@ -113,11 +113,11 @@ class SolarAttention(nn.Module):
         num_heads: int,
         num_kv_heads: int,
         rope_theta: float = 10000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embeddings: int = 8192,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         bias: bool = False,
-        cache_config: Optional[CacheConfig] = None,
+        cache_config: CacheConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -197,8 +197,8 @@ class SolarDecoderLayer(nn.Module):
     def __init__(
         self,
         config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -250,7 +250,7 @@ class SolarDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
         if residual is None:
@@ -322,11 +322,11 @@ class SolarModel(nn.Module):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -501,9 +501,9 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         model_output = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py
index 79ed001833444..a4e309e0aa6ba 100644
--- a/vllm/model_executor/models/stablelm.py
+++ b/vllm/model_executor/models/stablelm.py
@@ -24,7 +24,6 @@ model compatible with HuggingFace weights."""
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -63,7 +62,7 @@ class StablelmMLP(nn.Module):
     def __init__(
         self,
         config: StableLmConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -97,8 +96,8 @@ class StablelmAttention(nn.Module):
     def __init__(
         self,
         config: StableLmConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -184,8 +183,8 @@ class StablelmDecoderLayer(nn.Module):
     def __init__(
         self,
         config: StableLmConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -254,9 +253,9 @@ class StableLMEpochModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -340,9 +339,9 @@ class StablelmForCausalLM(nn.Module, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -351,7 +350,7 @@ class StablelmForCausalLM(nn.Module, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py
index ec894140c3bf3..d147237808c2a 100644
--- a/vllm/model_executor/models/starcoder2.py
+++ b/vllm/model_executor/models/starcoder2.py
@@ -23,7 +23,6 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Optional, Union
 
 import torch
 from torch import nn
@@ -67,8 +66,8 @@ class Starcoder2Attention(nn.Module):
     def __init__(
         self,
         config: Starcoder2Config,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -147,7 +146,7 @@ class Starcoder2MLP(nn.Module):
     def __init__(
         self,
         config: Starcoder2Config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -178,8 +177,8 @@ class Starcoder2DecoderLayer(nn.Module):
     def __init__(
         self,
         config: Starcoder2Config,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -258,9 +257,9 @@ class Starcoder2Model(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors],
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
@@ -347,9 +346,9 @@ class Starcoder2ForCausalLM(nn.Module, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
         )
@@ -358,7 +357,7 @@ class Starcoder2ForCausalLM(nn.Module, SupportsPP):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py
index 2099055e641c4..a2a1bfd30d8d8 100644
--- a/vllm/model_executor/models/step3_text.py
+++ b/vllm/model_executor/models/step3_text.py
@@ -4,7 +4,7 @@
 
 from collections.abc import Iterable
 from itertools import islice
-from typing import Any, Optional
+from typing import Any
 
 import torch
 from torch import nn
@@ -54,7 +54,7 @@ class FusedMoEBlock(nn.Module):
     def __init__(
         self,
         config: ModelConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -106,7 +106,7 @@ class Step3TextMLP(nn.Module):
         hidden_size: int,
         intermediate_size: int,
         hidden_act: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -146,12 +146,12 @@ class Step3TextAttention(nn.Module):
         num_kv_heads: int,
         norm_eps: float,
         rope_theta: int,
-        share_q_dim: Optional[int] = None,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        share_q_dim: int | None = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embedding: int = 8192,
         head_dim: int = 256,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -229,8 +229,8 @@ class Step3TextDecoderLayer(nn.Module):
     def __init__(
         self,
         config: ModelConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -291,7 +291,7 @@ class Step3TextDecoderLayer(nn.Module):
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        residual: Optional[torch.Tensor],
+        residual: torch.Tensor | None,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         if residual is None:
             residual = hidden_states
@@ -362,8 +362,8 @@ class Step3TextModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ) -> torch.Tensor:
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
@@ -436,8 +436,8 @@ class Step3TextForCausalLM(nn.Module, SupportsPP):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
     ):
         hidden_states = self.model(
             input_ids, positions, intermediate_tensors, inputs_embeds
diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py
index 5ec7845a122f7..dbb549ba3f985 100644
--- a/vllm/model_executor/models/step3_vl.py
+++ b/vllm/model_executor/models/step3_vl.py
@@ -4,7 +4,7 @@ import math
 from collections.abc import Iterable, Mapping, Sequence
 from itertools import product
 from math import ceil, sqrt
-from typing import Annotated, Any, Literal, Optional, Union
+from typing import Annotated, Any, Literal, TypeAlias
 
 import numpy as np
 import torch
@@ -71,7 +71,7 @@ class Step3VLImagePixelInputs(TensorSchema):
     type: Literal["pixel_values"]
     pixel_values: Annotated[torch.Tensor, TensorShape("bn", 3, "h", "w")]
     patch_pixel_values: Annotated[
-        Optional[torch.Tensor], TensorShape("bnp", 3, "hp", "wp")
+        torch.Tensor | None, TensorShape("bnp", 3, "hp", "wp")
     ]
     num_patches: Annotated[torch.Tensor, TensorShape("bn")]
 
@@ -88,7 +88,7 @@ class Step3VLImageEmbeddingInputs(TensorSchema):
     data: Annotated[torch.Tensor, TensorShape("bn", "f", "h")]
 
 
-Step3VLImageInputs = Union[Step3VLImagePixelInputs, Step3VLImageEmbeddingInputs]
+Step3VLImageInputs: TypeAlias = Step3VLImagePixelInputs | Step3VLImageEmbeddingInputs
 
 ImageWithPatches = tuple[Image.Image, list[Image.Image], list[int] | None]
 
@@ -409,7 +409,7 @@ class Step3VLProcessor:
         self,
         num_images: int,
         num_patches: int,
-        patch_new_line_idx: Optional[list[bool]],
+        patch_new_line_idx: list[bool] | None,
     ) -> tuple[str, list[int]]:
         if num_patches > 0:
             patch_repl, patch_repl_ids = self._get_patch_repl(
@@ -438,9 +438,9 @@ class Step3VLProcessor:
 
     def __call__(
         self,
-        text: Optional[Union[str, list[str]]] = None,
-        images: Optional[Union[Image.Image, list[Image.Image]]] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        text: str | list[str] | None = None,
+        images: Image.Image | list[Image.Image] | None = None,
+        return_tensors: str | TensorType | None = None,
     ) -> BatchFeature:
         if text is None:
             text = []
@@ -513,7 +513,7 @@ class Step3VLProcessingInfo(BaseProcessingInfo):
             self.get_tokenizer(),
         )
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_max_image_tokens(self) -> int:
@@ -556,7 +556,7 @@ class Step3VLDummyInputsBuilder(BaseDummyInputsBuilder[Step3VLProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         target_width, target_height = self.info.get_image_size_with_most_features()
         num_images = mm_counts.get("image", 0)
@@ -716,7 +716,7 @@ class Step3VisionAttention(nn.Module):
     def __init__(
         self,
         config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -778,7 +778,7 @@ class Step3VisionMLP(nn.Module):
     def __init__(
         self,
         config,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -813,7 +813,7 @@ class Step3VisionEncoderLayer(nn.Module):
     def __init__(
         self,
         config: Step3VisionEncoderConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -848,7 +848,7 @@ class Step3VisionEncoder(nn.Module):
     def __init__(
         self,
         config: Step3VisionEncoderConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -881,7 +881,7 @@ class Step3VisionTransformer(nn.Module):
     def __init__(
         self,
         config: Step3VisionEncoderConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
         use_data_parallel: bool = False,
     ):
@@ -927,7 +927,7 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
     supports_encoder_tp_data = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<im_patch>"
 
@@ -994,7 +994,7 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[Step3VLImageInputs]:
+    ) -> Step3VLImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         patch_pixel_values = kwargs.pop("patch_pixel_values", None)
         num_patches = kwargs.pop("num_patches", None)
@@ -1085,9 +1085,9 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
         *,
-        is_multimodal: Optional[torch.Tensor] = None,
+        is_multimodal: torch.Tensor | None = None,
         # Multi-modal token ID may exceed vocab size
         handle_oov_mm_token: bool = True,
     ) -> torch.Tensor:
@@ -1106,10 +1106,10 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
         elif inputs_embeds is None:
@@ -1130,7 +1130,7 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
diff --git a/vllm/model_executor/models/swin.py b/vllm/model_executor/models/swin.py
index 485c008e830a9..a74fd80c06d8c 100644
--- a/vllm/model_executor/models/swin.py
+++ b/vllm/model_executor/models/swin.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -28,7 +27,7 @@ class SwinSelfAttention(nn.Module):
         dim: int,
         num_heads: int,
         window_size: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -102,9 +101,9 @@ class SwinSelfAttention(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        output_attentions: Optional[bool] = False,
+        attention_mask: torch.FloatTensor | None = None,
+        head_mask: torch.FloatTensor | None = None,
+        output_attentions: bool | None = False,
     ) -> tuple[torch.Tensor, ...]:
         batch_size, dim, num_channels = hidden_states.shape
 
@@ -155,7 +154,7 @@ class SwinSelfOutput(nn.Module):
         self,
         config: SwinConfig,
         dim: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -181,7 +180,7 @@ class SwinAttention(nn.Module):
         dim: int,
         num_heads: int,
         window_size: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -201,9 +200,9 @@ class SwinAttention(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        output_attentions: Optional[bool] = False,
+        attention_mask: torch.FloatTensor | None = None,
+        head_mask: torch.FloatTensor | None = None,
+        output_attentions: bool | None = False,
     ) -> tuple[torch.Tensor]:
         self_outputs = self.self(
             hidden_states, attention_mask, head_mask, output_attentions
@@ -218,7 +217,7 @@ class SwinIntermediate(nn.Module):
         self,
         config: SwinConfig,
         dim: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -241,7 +240,7 @@ class SwinOutput(nn.Module):
         self,
         config: SwinConfig,
         dim: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -266,7 +265,7 @@ class SwinLayer(HFSwinLayer):
         num_heads: int,
         drop_path_rate: float = 0.0,
         shift_size: int = 0,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__(
@@ -303,8 +302,8 @@ class SwinStage(nn.Module):
         depth: int,
         num_heads: int,
         drop_path: list[float],
-        downsample: Optional[SwinPatchMerging] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        downsample: SwinPatchMerging | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -340,9 +339,9 @@ class SwinStage(nn.Module):
         self,
         hidden_states: torch.Tensor,
         input_dimensions: tuple[int, int],
-        head_mask: Optional[torch.FloatTensor] = None,
-        output_attentions: Optional[bool] = False,
-        always_partition: Optional[bool] = False,
+        head_mask: torch.FloatTensor | None = None,
+        output_attentions: bool | None = False,
+        always_partition: bool | None = False,
     ) -> tuple[torch.Tensor]:
         height, width = input_dimensions
         for i, layer_module in enumerate(self.blocks):
@@ -384,7 +383,7 @@ class SwinEncoder(nn.Module):
         self,
         config: SwinConfig,
         grid_size: int,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -426,9 +425,9 @@ class SwinEncoder(nn.Module):
         self,
         hidden_states: torch.Tensor,
         input_dimensions: tuple[int, int],
-        head_mask: Optional[torch.FloatTensor] = None,
-        output_attentions: Optional[bool] = False,
-        always_partition: Optional[bool] = False,
+        head_mask: torch.FloatTensor | None = None,
+        output_attentions: bool | None = False,
+        always_partition: bool | None = False,
     ) -> tuple[torch.Tensor]:
         for i, layer_module in enumerate(self.layers):
             layer_head_mask = head_mask[i] if head_mask is not None else None
@@ -455,7 +454,7 @@ class SwinModel(nn.Module):
     def __init__(
         self,
         config: SwinConfig,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         super().__init__()
@@ -473,9 +472,9 @@ class SwinModel(nn.Module):
 
     def forward(
         self,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        output_attentions: Optional[bool] = None,
+        pixel_values: torch.FloatTensor | None = None,
+        head_mask: torch.FloatTensor | None = None,
+        output_attentions: bool | None = None,
     ) -> tuple[torch.Tensor]:
         embedding_output, input_dimensions = self.embeddings(pixel_values)
 
diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py
index 6a224fe9288b2..bfa1b5bbaf84f 100644
--- a/vllm/model_executor/models/tarsier.py
+++ b/vllm/model_executor/models/tarsier.py
@@ -3,7 +3,7 @@
 
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Final, Literal, Optional, Protocol, TypeVar, Union
+from typing import Annotated, Final, Literal, Protocol, TypeAlias, TypeVar
 
 import torch
 import torch.nn as nn
@@ -81,7 +81,7 @@ class TarsierImageEmbeddingInputs(TensorSchema):
     data: Annotated[torch.Tensor, TensorShape("bn", "ifs", "hs")]
 
 
-TarsierImageInputs = Union[TarsierImagePixelInputs, TarsierImageEmbeddingInputs]
+TarsierImageInputs: TypeAlias = TarsierImagePixelInputs | TarsierImageEmbeddingInputs
 
 
 class TarsierHfConfig(Protocol):  # Based on the Tarsier's LlavaConfig
@@ -89,7 +89,7 @@ class TarsierHfConfig(Protocol):  # Based on the Tarsier's LlavaConfig
     text_config: Final[PretrainedConfig]  # Added from Tarsier's LlavaConfig
     image_token_index: Final[int]
     vision_feature_select_strategy: Final[str]
-    vision_feature_layer: Final[Union[int, list[int]]]
+    vision_feature_layer: Final[int | list[int]]
     projector_hidden_act: Final[str]
     image_newline_idx: Final[int]
     image_new_idx: Final[int]
@@ -109,9 +109,10 @@ class TarsierProcessor(LlavaProcessor):
     def __call__(
         self,
         images: ImageInput = None,
-        text: Union[
-            TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]
-        ] = None,
+        text: TextInput
+        | PreTokenizedInput
+        | list[TextInput]
+        | list[PreTokenizedInput] = None,
         audio=None,
         videos=None,
         **kwargs: Unpack[TarsierProcessorKwargs],
@@ -173,7 +174,7 @@ class TarsierMultiModalProjector(nn.Module):
         text_hidden_size: int,
         projector_hidden_act: str,
         multimodal_projector_bias: bool,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -215,7 +216,7 @@ class TarsierProcessingInfo(BaseProcessingInfo):
 
         return self.ctx.get_hf_processor(TarsierProcessor, **kwargs)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
     def get_num_image_tokens(
@@ -331,7 +332,7 @@ def _build_tarsier_hf_processor(
     info: _I_Tarsier,
     dummy_inputs: BaseDummyInputsBuilder[_I_Tarsier],
     *,
-    cache: Optional[BaseMultiModalProcessorCache] = None,
+    cache: BaseMultiModalProcessorCache | None = None,
 ) -> BaseMultiModalProcessor:
     if isinstance(info, TarsierProcessingInfo):
         return TarsierMultiModalProcessor(
@@ -344,11 +345,11 @@ def _build_tarsier_hf_processor(
 
 def init_vision_tower_for_tarsier(
     hf_config: TarsierHfConfig,  # Use the Tarsier specific config protocol
-    quant_config: Optional[QuantizationConfig],
+    quant_config: QuantizationConfig | None,
     *,
-    require_post_norm: Optional[bool] = None,
+    require_post_norm: bool | None = None,
     prefix: str = "",
-) -> Union[CLIPVisionModel, SiglipVisionModel]:
+) -> CLIPVisionModel | SiglipVisionModel:
     vision_config = hf_config.vision_config
 
     feature_layers = hf_config.vision_feature_layer
@@ -407,7 +408,7 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
     }
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return "<image>"
 
@@ -456,7 +457,7 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
 
     def _parse_and_validate_image_input(
         self, **kwargs: object
-    ) -> Optional[TarsierImageInputs]:
+    ) -> TarsierImageInputs | None:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
 
@@ -479,9 +480,9 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
 
     def _image_pixels_to_features(
         self,
-        vision_tower: Union[CLIPVisionModel, SiglipVisionModel],
-        pixel_values: Union[torch.Tensor, list[torch.Tensor]],
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+        vision_tower: CLIPVisionModel | SiglipVisionModel,
+        pixel_values: torch.Tensor | list[torch.Tensor],
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         # From vLLM LLaVA, vision tower output handling
         return vision_tower(
             pixel_values,
@@ -540,7 +541,7 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
     def _process_image_pixels(
         self,
         inputs: TarsierImagePixelInputs,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         assert self.vision_tower is not None
         pixel_values = inputs["pixel_values"]
         image_features_selected = self._image_pixels_to_features(
@@ -559,7 +560,7 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
     def _process_image_input(
         self,
         image_input: TarsierImageInputs,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]:
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         if image_input["type"] == "image_embeds":
             projected_features = image_input["data"]
             if isinstance(projected_features, torch.Tensor):
@@ -585,10 +586,10 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
         elif inputs_embeds is None:
@@ -610,7 +611,7 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py
index 13d2e8eacc013..e8506666db5bc 100644
--- a/vllm/model_executor/models/terratorch.py
+++ b/vllm/model_executor/models/terratorch.py
@@ -18,8 +18,8 @@
 """Wrapper around `Terratorch` models"""
 
 from collections import OrderedDict
-from collections.abc import Iterable, Mapping, Sequence
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable, Iterable, Mapping, Sequence
+from typing import Any
 
 import torch
 import torch.nn as nn
@@ -96,7 +96,7 @@ def _terratorch_field_factory(
 
 
 class TerratorchProcessingInfo(BaseProcessingInfo):
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"image": None}
 
 
@@ -114,7 +114,7 @@ class TerratorchInputBuilder(BaseDummyInputsBuilder[TerratorchProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         # Dummy data is generated based on the 'input' section
         # defined in the HF configuration file
@@ -136,8 +136,8 @@ class TerratorchMultiModalDataParser(MultiModalDataParser):
 
     def _parse_image_data(
         self,
-        data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
-    ) -> Optional[ModalityDataItems[Any, Any]]:
+        data: dict[str, torch.Tensor] | ModalityData[ImageItem],
+    ) -> ModalityDataItems[Any, Any] | None:
         if isinstance(data, dict):
             terratorch_fields = _terratorch_field_names(self._pretrained_cfg)
 
@@ -157,7 +157,7 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor):
         info: TerratorchProcessingInfo,
         dummy_inputs: "BaseDummyInputsBuilder[TerratorchProcessingInfo]",
         *,
-        cache: Optional[MultiModalProcessorOnlyCache] = None,
+        cache: MultiModalProcessorOnlyCache | None = None,
     ) -> None:
         self.pretrained_cfg = info.get_hf_config().to_dict()["pretrained_cfg"]
         super().__init__(info=info, dummy_inputs=dummy_inputs, cache=cache)
@@ -182,11 +182,11 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor):
 
     def apply(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-        tokenization_kwargs: Optional[Mapping[str, object]] = None,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        tokenization_kwargs: Mapping[str, object] | None = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalInputs:
         if "image" in mm_data:
             image_data = mm_data["image"]
@@ -232,7 +232,7 @@ class Terratorch(nn.Module, IsAttentionFree, SupportsMultiModal):
     is_pooling_model = True
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("image"):
             return None
 
@@ -256,9 +256,9 @@ class Terratorch(nn.Module, IsAttentionFree, SupportsMultiModal):
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
         *,
-        is_multimodal: Optional[torch.Tensor] = None,
+        is_multimodal: torch.Tensor | None = None,
         handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
         # We do not really use any input tokens and therefore no embeddings
@@ -269,10 +269,10 @@ class Terratorch(nn.Module, IsAttentionFree, SupportsMultiModal):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ):
         model_output = self.inference_runner.forward(**kwargs)
diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py
index 1cfe401b243c7..82f5410ece63f 100644
--- a/vllm/model_executor/models/transformers.py
+++ b/vllm/model_executor/models/transformers.py
@@ -19,7 +19,7 @@
 from collections.abc import Iterable, Mapping
 from contextlib import contextmanager
 from pathlib import Path
-from typing import Literal, Optional, Union
+from typing import Literal
 
 import regex as re
 import torch
@@ -103,9 +103,9 @@ def vllm_flash_attention_forward(
     value: torch.Tensor,
     attention_mask: torch.Tensor,
     # Transformers kwargs
-    scaling: Optional[float] = None,
+    scaling: float | None = None,
     # vLLM kwargs
-    attention_instances: Optional[dict[Attention]] = None,
+    attention_instances: dict[Attention] | None = None,
     **kwargs,
 ):
     self_attn = attention_instances[module.layer_idx]
@@ -147,10 +147,10 @@ Style = Literal["colwise", "colwise_rep", "rowwise", "rowwise_rep", "replicate"]
 def replace_linear_class(
     linear: nn.Linear,
     style: Style = "replicate",
-    quant_config: Optional[QuantizationConfig] = None,
+    quant_config: QuantizationConfig | None = None,
     *,
     prefix: str = "",
-) -> Union[ColumnParallelLinear, RowParallelLinear, ReplicatedLinear]:
+) -> ColumnParallelLinear | RowParallelLinear | ReplicatedLinear:
     """
     Replace nn.Linear with one of vLLM's tensor parallel linear classes.
 
@@ -312,7 +312,7 @@ class MultiModalDummyInputsBuilder(BaseDummyInputsBuilder[MultiModalProcessingIn
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
@@ -388,11 +388,11 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
 
     def apply(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-        tokenization_kwargs: Optional[Mapping[str, object]] = None,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        tokenization_kwargs: Mapping[str, object] | None = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalInputs:
         """
         Process multi-modal inputs to be used in vLLM.
@@ -498,7 +498,7 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
         self.device_config: DeviceConfig = vllm_config.device_config
         self.model_config: ModelConfig = vllm_config.model_config
         self.parallel_config: ParallelConfig = vllm_config.parallel_config
-        self.quant_config: Optional[QuantizationConfig] = vllm_config.quant_config
+        self.quant_config: QuantizationConfig | None = vllm_config.quant_config
 
         self.pp_group = get_pp_group()
         self.tp_group = get_tp_group()
@@ -714,7 +714,7 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
             )
         return attention_instances
 
-    def init_parameters(self, module: nn.Module, dtype: Optional[torch.dtype] = None):
+    def init_parameters(self, module: nn.Module, dtype: torch.dtype | None = None):
         """
         If a `parameter` is on the `meta` device, then its parent
         `module` is the original module created by:
@@ -725,7 +725,7 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
         ```
         """
 
-        def _init_parameters(module: nn.Module, dtype: Optional[torch.dtype]):
+        def _init_parameters(module: nn.Module, dtype: torch.dtype | None):
             for name, param in module.named_parameters(recurse=False):
                 if param.device == torch.device("meta"):
                     new_param = nn.Parameter(
@@ -743,12 +743,12 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if not self.pp_group.is_first_rank:
             assert intermediate_tensors is not None
             input_ids = None
@@ -841,7 +841,7 @@ class TransformersForCausalLM(TransformersBase):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         logits = self.logits_processor(self.lm_head, hidden_states)
         return logits
 
@@ -895,12 +895,12 @@ class TransformersForMultimodalLM(TransformersForCausalLM, SupportsMultiModal):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         # Gemma3 and PaliGemma needs `token_type_ids` to work correctly
         # Other models will not have `token_type_ids` in kwargs
         kwargs = {k: v for k, v in kwargs.items() if k == "token_type_ids"}
@@ -924,8 +924,8 @@ class TransformersForMultimodalLM(TransformersForCausalLM, SupportsMultiModal):
         return LanguageModelWrapper(self)
 
     def get_multimodal_embeddings(self, **kwargs):
-        pixel_values: Optional[torch.Tensor] = kwargs.pop("pixel_values", None)
-        image_embeds: Optional[torch.Tensor] = kwargs.pop("image_embeds", None)
+        pixel_values: torch.Tensor | None = kwargs.pop("pixel_values", None)
+        image_embeds: torch.Tensor | None = kwargs.pop("image_embeds", None)
         # Model might use `image_patches` instead of `pixel_values`
         if pixel_values is None:
             pixel_values = kwargs.pop("image_patches", None)
diff --git a/vllm/model_executor/models/transformers_pooling.py b/vllm/model_executor/models/transformers_pooling.py
index 98d2611351c03..411fb92e9460b 100644
--- a/vllm/model_executor/models/transformers_pooling.py
+++ b/vllm/model_executor/models/transformers_pooling.py
@@ -16,8 +16,6 @@
 # limitations under the License.
 """Wrapper around `transformers` models for pooling tasks."""
 
-from typing import Optional, Union
-
 import torch
 from transformers import AutoModelForSequenceClassification
 
@@ -109,11 +107,11 @@ class TransformersPoolingBase(TransformersBase, VllmModelForPooling):
 
     def forward(
         self,
-        input_ids: Optional[torch.Tensor],
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         if self.is_roberta:
             # RoBERTa-specific positions padding
             positions += self.padding_idx + 1
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 1fc34f48401df..95d574fb81d7a 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -5,7 +5,7 @@
 """PyTorch Ultravox model."""
 
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Any, Literal, Optional, Union
+from typing import Annotated, Any, Literal, TypeAlias
 
 import torch
 from torch import nn
@@ -68,7 +68,7 @@ class UltravoxAudioFeatureInputs(TensorSchema):
 
     type: Literal["audio_features"]
     data: Annotated[
-        Union[torch.Tensor, list[torch.Tensor], list[list[torch.Tensor]]],
+        torch.Tensor | list[torch.Tensor] | list[list[torch.Tensor]],
         TensorShape("bn", "nmb", "t"),
     ]
     lens: Annotated[torch.Tensor, TensorShape("bn")]
@@ -92,11 +92,13 @@ class UltravoxAudioEmbeddingInputs(TensorSchema):
 
     type: Literal["audio_embeds"]
     data: Annotated[
-        Union[torch.Tensor, list[torch.Tensor]], TensorShape("b", "na", "afs", "hs")
+        torch.Tensor | list[torch.Tensor], TensorShape("b", "na", "afs", "hs")
     ]
 
 
-UltravoxAudioInputs = Union[UltravoxAudioFeatureInputs, UltravoxAudioEmbeddingInputs]
+UltravoxAudioInputs: TypeAlias = (
+    UltravoxAudioFeatureInputs | UltravoxAudioEmbeddingInputs
+)
 
 
 class UltravoxProcessingInfo(BaseProcessingInfo):
@@ -119,7 +121,7 @@ class UltravoxProcessingInfo(BaseProcessingInfo):
         assert isinstance(feature_extractor, WhisperFeatureExtractor)
         return feature_extractor
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": None}
 
 
@@ -133,7 +135,7 @@ class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo])
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         feature_extractor = self.info.get_feature_extractor()
 
@@ -346,7 +348,7 @@ class ModifiedWhisperEncoder(WhisperEncoder):
         )
 
     def get_attention_mask_by_audio_len(
-        self, audio_lens: Optional[torch.Tensor], hidden_states: torch.Tensor
+        self, audio_lens: torch.Tensor | None, hidden_states: torch.Tensor
     ):
         """
         Create attention mask based on audio lengths to mask out padding tokens
@@ -376,7 +378,7 @@ class ModifiedWhisperEncoder(WhisperEncoder):
     def forward(
         self,
         input_features: torch.Tensor,
-        audio_lens: Optional[torch.Tensor] = None,
+        audio_lens: torch.Tensor | None = None,
     ):
         expected_seq_length = self.max_context_length
         if input_features.shape[-1] > expected_seq_length:
@@ -431,7 +433,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
     )
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("audio"):
             return "<|audio|>"
 
@@ -514,7 +516,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
 
     def _parse_and_validate_audio_input(
         self, **kwargs: object
-    ) -> Optional[UltravoxAudioInputs]:
+    ) -> UltravoxAudioInputs | None:
         audio_features = kwargs.pop("audio_features", None)
         audio_embeds = kwargs.pop("audio_embeds", None)
         audio_lens = kwargs.pop("audio_lens", None)
@@ -541,7 +543,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
     def _process_audio_input(
         self,
         audio_input: UltravoxAudioInputs,
-    ) -> Union[NestedTensors, tuple[torch.Tensor, ...]]:
+    ) -> NestedTensors | tuple[torch.Tensor, ...]:
         if audio_input["type"] == "audio_embeds":
             return audio_input["data"]
 
@@ -587,9 +589,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
         *,
-        is_multimodal: Optional[torch.Tensor] = None,
+        is_multimodal: torch.Tensor | None = None,
         # Multi-modal token ID may exceed vocab size
         handle_oov_mm_token: bool = True,
     ) -> torch.Tensor:
@@ -608,10 +610,10 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[torch.Tensor] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: torch.Tensor | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         """Run forward pass for Ultravox
 
         One key thing to understand is the `input_ids` already accounts for the
@@ -651,7 +653,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
 
 
 def pad_and_concat_to_dim3(
-    features: Union[torch.Tensor, list[torch.Tensor], list[list[torch.Tensor]]],
+    features: torch.Tensor | list[torch.Tensor] | list[list[torch.Tensor]],
 ) -> torch.Tensor:
     """
     Pad and concatenate a list of tensors.
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index bd530be73c2ad..8812ed177f568 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -4,7 +4,7 @@
 import itertools
 from collections.abc import Iterable, Mapping
 from dataclasses import dataclass, field
-from typing import Any, Literal, Optional, Protocol, Union, overload
+from typing import Any, Literal, Protocol, overload
 
 import torch
 import torch.nn as nn
@@ -32,7 +32,7 @@ from vllm.utils import (
 
 logger = init_logger(__name__)
 
-WeightsMapping = Mapping[str, Optional[str]]
+WeightsMapping = Mapping[str, str | None]
 """If a key maps to a value of `None`, the corresponding weight is ignored."""
 
 
@@ -44,7 +44,7 @@ class WeightsMapper:
     orig_to_new_prefix: WeightsMapping = field(default_factory=dict)
     orig_to_new_suffix: WeightsMapping = field(default_factory=dict)
 
-    def _map_name(self, key: str) -> Optional[str]:
+    def _map_name(self, key: str) -> str | None:
         for substr, new_key in self.orig_to_new_substr.items():
             if substr in key:
                 if new_key is None:
@@ -120,10 +120,10 @@ class AutoWeightsLoader:
         self,
         module: nn.Module,
         *,
-        skip_prefixes: Optional[list[str]] = None,
-        skip_substrs: Optional[list[str]] = None,
-        ignore_unexpected_prefixes: Optional[list[str]] = None,
-        ignore_unexpected_suffixes: Optional[list[str]] = None,
+        skip_prefixes: list[str] | None = None,
+        skip_substrs: list[str] | None = None,
+        ignore_unexpected_prefixes: list[str] | None = None,
+        ignore_unexpected_suffixes: list[str] | None = None,
     ) -> None:
         super().__init__()
 
@@ -306,7 +306,7 @@ class AutoWeightsLoader:
         self,
         weights: Iterable[tuple[str, torch.Tensor]],
         *,
-        mapper: Optional[WeightsMapper] = None,
+        mapper: WeightsMapper | None = None,
     ) -> set[str]:
         if mapper is not None:
             weights = mapper.apply(weights)
@@ -323,8 +323,8 @@ def init_vllm_registered_model(
     vllm_config: VllmConfig,
     *,
     prefix: str = "",
-    hf_config: Optional[PretrainedConfig] = None,
-    architectures: Optional[list[str]] = None,
+    hf_config: PretrainedConfig | None = None,
+    architectures: list[str] | None = None,
 ) -> nn.Module:
     """
     Helper function to initialize an inner model registered to vLLM,
@@ -352,7 +352,7 @@ def flatten_bn(x: list[torch.Tensor]) -> list[torch.Tensor]: ...
 
 @overload
 def flatten_bn(
-    x: Union[list[torch.Tensor], torch.Tensor],
+    x: list[torch.Tensor] | torch.Tensor,
     *,
     concat: Literal[True],
 ) -> torch.Tensor: ...
@@ -360,17 +360,17 @@ def flatten_bn(
 
 @overload
 def flatten_bn(
-    x: Union[list[torch.Tensor], torch.Tensor],
+    x: list[torch.Tensor] | torch.Tensor,
     *,
     concat: bool = False,
-) -> Union[list[torch.Tensor], torch.Tensor]: ...
+) -> list[torch.Tensor] | torch.Tensor: ...
 
 
 def flatten_bn(
-    x: Union[list[torch.Tensor], torch.Tensor],
+    x: list[torch.Tensor] | torch.Tensor,
     *,
     concat: bool = False,
-) -> Union[list[torch.Tensor], torch.Tensor]:
+) -> list[torch.Tensor] | torch.Tensor:
     """
     Flatten the ``B`` and ``N`` dimensions of batched multimodal inputs.
 
@@ -472,7 +472,7 @@ def merge_multimodal_embeddings(
     input_ids: torch.Tensor,
     inputs_embeds: torch.Tensor,
     multimodal_embeddings: NestedTensors,
-    placeholder_token_id: Union[int, list[int]],
+    placeholder_token_id: int | list[int],
 ) -> torch.Tensor:
     """
     Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index e517109e94dd6..bd5a6cf018d2e 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -4,7 +4,8 @@
 import itertools
 import math
 from abc import ABC, abstractmethod
-from typing import Callable, Final, Generic, Literal, Optional, Protocol, TypeVar, Union
+from collections.abc import Callable
+from typing import Final, Generic, Literal, Protocol, TypeAlias, TypeVar
 
 import torch
 from transformers import PretrainedConfig
@@ -84,7 +85,7 @@ def get_vit_attn_backend(head_size: int, dtype: torch.dtype) -> _Backend:
     # Lazy import to avoid circular dependency
     from vllm.attention.selector import get_env_variable_attn_backend
 
-    selected_backend: Optional[_Backend] = get_env_variable_attn_backend()
+    selected_backend: _Backend | None = get_env_variable_attn_backend()
     if selected_backend is not None:
         return selected_backend
 
@@ -93,14 +94,13 @@ def get_vit_attn_backend(head_size: int, dtype: torch.dtype) -> _Backend:
 
 VisionFeatureSelectStrategyStr = Literal["class", "default", "full"]
 
-VisionFeatureSelectStrategy = Union[
-    VisionFeatureSelectStrategyStr,
-    Callable[[torch.Tensor], torch.Tensor],
-]
+VisionFeatureSelectStrategy: TypeAlias = (
+    VisionFeatureSelectStrategyStr | Callable[[torch.Tensor], torch.Tensor]
+)
 
 
 def _get_vision_feature_selector(
-    strategy: Union[VisionFeatureSelectStrategy, str],
+    strategy: VisionFeatureSelectStrategy | str,
 ) -> Callable[[torch.Tensor], torch.Tensor]:
     if callable(strategy):
         return strategy
@@ -121,7 +121,7 @@ def _get_vision_feature_selector(
 
 def get_num_selected_vision_tokens(
     num_vision_tokens: int,
-    strategy: Union[VisionFeatureSelectStrategy, str],
+    strategy: VisionFeatureSelectStrategy | str,
 ) -> int:
     if callable(strategy):
         dummy_features = torch.empty(1, num_vision_tokens, 64)  # [B, L, D]
@@ -141,12 +141,12 @@ def get_num_selected_vision_tokens(
 
 
 def resolve_visual_encoder_outputs(
-    encoder_outputs: Union[torch.Tensor, list[torch.Tensor]],
-    post_layer_norm: Optional[torch.nn.LayerNorm],
+    encoder_outputs: torch.Tensor | list[torch.Tensor],
+    post_layer_norm: torch.nn.LayerNorm | None,
     *,
-    select_layers: Optional[list[int]] = None,
-    max_possible_layers: Optional[int] = None,
-    feature_select_strategy: Optional[VisionFeatureSelectStrategy] = None,
+    select_layers: list[int] | None = None,
+    max_possible_layers: int | None = None,
+    feature_select_strategy: VisionFeatureSelectStrategy | None = None,
 ) -> torch.Tensor:
     """Given the outputs a visual encoder module that may correspond to the
     output of the last layer, or a list of hidden states to be stacked,
diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py
index f4bfbd26756e1..cce18984b67e4 100644
--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -5,7 +5,7 @@ import math
 from collections.abc import Iterable, Mapping, Sequence
 from functools import cached_property
 from math import ceil
-from typing import Literal, Optional, Union, cast
+from typing import Literal, cast
 
 import numpy as np
 import regex as re
@@ -125,9 +125,9 @@ class VoxtralProcessorAdapter:
 
     def __call__(
         self,
-        text: Optional[Union[TextInput, list[TextInput]]] = None,
-        audios: Optional[Union[np.ndarray, list[np.ndarray]]] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        text: TextInput | list[TextInput] | None = None,
+        audios: np.ndarray | list[np.ndarray] | None = None,
+        return_tensors: str | TensorType | None = None,
         **kwargs,
     ) -> Mapping[str, NestedTensors]:
         if text is None:
@@ -188,7 +188,7 @@ class VoxtralProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self) -> VoxtralProcessorAdapter:
         return VoxtralProcessorAdapter(self.get_tokenizer())
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": 5}  # Performance tends to degrade after 5
 
     def get_mm_max_tokens_per_item(
@@ -216,7 +216,7 @@ class VoxtralDummyInputsBuilder(BaseDummyInputsBuilder[VoxtralProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         num_audios = mm_counts.get("audio", 0)
 
@@ -234,7 +234,7 @@ class VoxtralDummyInputsBuilder(BaseDummyInputsBuilder[VoxtralProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> ProcessorInputs:
         tokenizer = self.info.get_tokenizer()
 
@@ -303,11 +303,11 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo])
 
     def _cached_apply_hf_processor(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(
             prompt=prompt,
@@ -386,10 +386,10 @@ class VoxtralForConditionalGeneration(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+    ) -> torch.Tensor | IntermediateTensors:
         if intermediate_tensors is not None:
             inputs_embeds = None
 
@@ -401,7 +401,7 @@ class VoxtralForConditionalGeneration(
 
     def get_multimodal_embeddings(
         self, **kwargs
-    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...], None]:
+    ) -> list[torch.Tensor] | torch.Tensor | tuple[torch.Tensor, ...] | None:
         audio_inputs = self._parse_and_validate_audio_arrays(**kwargs)
         if audio_inputs is None:
             return None
@@ -433,7 +433,7 @@ class VoxtralForConditionalGeneration(
 
     def _parse_and_validate_audio_arrays(
         self, **kwargs: object
-    ) -> Union[list[torch.Tensor], None]:
+    ) -> list[torch.Tensor] | None:
         audio_arrays = kwargs.pop("audio_arrays", None)
         if audio_arrays is None:
             return None
@@ -450,7 +450,7 @@ class VoxtralForConditionalGeneration(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
     @classmethod
@@ -475,10 +475,10 @@ class VoxtralForConditionalGeneration(
         audio: np.ndarray,
         model_config: ModelConfig,
         stt_config: SpeechToTextConfig,
-        language: Optional[str],
+        language: str | None,
         task_type: Literal["transcribe", "translate"],
         request_prompt: str,
-        to_language: Optional[str],
+        to_language: str | None,
     ) -> PromptType:
         tokenizer = cached_tokenizer_from_config(model_config)
         audio = Audio(audio, int(stt_config.sample_rate), format="wav")  # lossless
@@ -500,7 +500,7 @@ class VoxtralForConditionalGeneration(
         audio_duration_s: float,
         stt_config: SpeechToTextConfig,
         model_config: ModelConfig,
-    ) -> Optional[int]:
+    ) -> int | None:
         """
         Map from audio duration to number of audio tokens produced by the ASR
         model, without running a forward pass.
@@ -793,7 +793,7 @@ class VoxtralEncoderModel(nn.Module):
         return torch.stack(chunked_features), chunks_per_example
 
     def forward(
-        self, input_features: Union[torch.Tensor, list[torch.Tensor]]
+        self, input_features: torch.Tensor | list[torch.Tensor]
     ) -> list[torch.Tensor]:
         if not isinstance(input_features, list):
             input_features = [input_features]
diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index 397556cbbcc47..0246e0739b0fd 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -4,7 +4,7 @@
 import math
 from collections.abc import Iterable, Mapping, Sequence
 from contextlib import nullcontext
-from typing import Annotated, Literal, Optional, Union, cast
+from typing import Annotated, Literal, cast
 
 import numpy as np
 import torch
@@ -137,7 +137,7 @@ class WhisperAudioInputs(TensorSchema):
     """
 
     input_features: Annotated[
-        Optional[list[torch.Tensor]],
+        list[torch.Tensor] | None,
         TensorShape("b", "nmb", "t"),
     ]
 
@@ -185,8 +185,8 @@ class WhisperAttention(nn.Module):
         num_heads: int,
         bias: bool = True,
         attn_type: AttentionType = AttentionType.DECODER,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -258,7 +258,7 @@ class WhisperAttention(nn.Module):
         self,
         embed_dim: int,
         bias: bool = True,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         self.qkv_proj = QKVParallelLinear(
@@ -291,8 +291,8 @@ class WhisperCrossAttention(WhisperAttention):
         embed_dim: int,
         num_heads: int,
         bias: bool = True,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__(
@@ -309,7 +309,7 @@ class WhisperCrossAttention(WhisperAttention):
         self,
         embed_dim: int,
         bias: bool = True,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         self.q_proj = ColumnParallelLinear(
@@ -332,7 +332,7 @@ class WhisperCrossAttention(WhisperAttention):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        encoder_hidden_states: Optional[torch.Tensor],
+        encoder_hidden_states: torch.Tensor | None,
     ):
         q, _ = self.q_proj(hidden_states)
 
@@ -357,7 +357,7 @@ class WhisperMLP(nn.Module):
         embed_dim: int,
         ffn_dim: int,
         act_fn: str,
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         super().__init__()
@@ -463,7 +463,7 @@ class WhisperDecoderLayer(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        encoder_hidden_states: Optional[torch.Tensor],
+        encoder_hidden_states: torch.Tensor | None,
     ):
         residual = hidden_states
         hidden_states = self.self_attn_layer_norm(hidden_states)
@@ -521,7 +521,7 @@ class WhisperEncoder(nn.Module):
                 sinusoids(*self.embed_positions.weight.shape)
             )
 
-    def forward(self, input_features: Union[torch.Tensor, list[torch.Tensor]]):
+    def forward(self, input_features: torch.Tensor | list[torch.Tensor]):
         hidden_states = []
         for features in input_features:
             embeds = nn.functional.gelu(self.conv1(features))
@@ -569,7 +569,7 @@ class WhisperDecoder(nn.Module):
         self,
         input_ids,
         positions: torch.Tensor,
-        encoder_hidden_states: Optional[torch.Tensor],
+        encoder_hidden_states: torch.Tensor | None,
     ):
         inputs_embeds = self.get_input_embeddings(input_ids)
         positions = self.embed_positions(positions)
@@ -600,8 +600,8 @@ class WhisperModel(nn.Module):
 
     def forward(
         self,
-        input_features: Optional[Union[torch.Tensor, list[torch.Tensor]]],
-        input_ids: Optional[torch.Tensor],
+        input_features: torch.Tensor | list[torch.Tensor] | None,
+        input_ids: torch.Tensor | None,
         positions: torch.Tensor,
     ) -> torch.Tensor:
         encoder_outputs = self.get_encoder_outputs(input_features)
@@ -614,8 +614,8 @@ class WhisperModel(nn.Module):
 
     def get_encoder_outputs(
         self,
-        input_features: Optional[Union[torch.Tensor, list[torch.Tensor]]],
-    ) -> Optional[torch.Tensor]:
+        input_features: torch.Tensor | list[torch.Tensor] | None,
+    ) -> torch.Tensor | None:
         if input_features is None:
             return None
         return self.encoder(input_features)
@@ -670,7 +670,7 @@ class WhisperProcessingInfo(BaseProcessingInfo):
             processor_class.tokenizer_class = tokenizer_class
         return self.ctx.get_hf_processor(processor_class, **kwargs)
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         return {"audio": 1}
 
     def get_feature_extractor(self, **kwargs: object) -> WhisperFeatureExtractor:
@@ -693,7 +693,7 @@ class WhisperDummyInputsBuilder(BaseDummyInputsBuilder[WhisperProcessingInfo]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         feature_extractor = self.info.get_feature_extractor()
 
@@ -721,9 +721,9 @@ class WhisperMultiModalProcessor(EncDecMultiModalProcessor[WhisperProcessingInfo
 
     def create_encoder_prompt(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data: MultiModalDataDict,
-    ) -> Union[str, list[int]]:
+    ) -> str | list[int]:
         # Strictly speaking, whisper encoder only accept audio features.
         # We create a dummy encoder prompt here which will be padded to
         # num_audio_tokens. So that we can create dummy data from this
@@ -804,7 +804,7 @@ class WhisperForConditionalGeneration(
     supported_languages = ISO639_1_SUPPORTED_LANGS
 
     @classmethod
-    def validate_language(cls, language: Optional[str]) -> Optional[str]:
+    def validate_language(cls, language: str | None) -> str | None:
         if language is None:
             # TODO language should be optional and can be guessed.
             # For now we default to en. See
@@ -823,10 +823,10 @@ class WhisperForConditionalGeneration(
         audio: np.ndarray,
         model_config: ModelConfig,  # not needed here
         stt_config: SpeechToTextConfig,
-        language: Optional[str],
+        language: str | None,
         task_type: Literal["transcribe", "translate"],
         request_prompt: str,
-        to_language: Optional[str],
+        to_language: str | None,
     ) -> PromptType:
         if language is None:
             raise ValueError(
@@ -849,7 +849,7 @@ class WhisperForConditionalGeneration(
         return cast(PromptType, prompt)
 
     @classmethod
-    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
         if modality.startswith("audio"):
             return None
 
@@ -872,7 +872,7 @@ class WhisperForConditionalGeneration(
         audio_duration_s: float,
         stt_config: SpeechToTextConfig,
         model_config: ModelConfig,
-    ) -> Optional[int]:
+    ) -> int | None:
         processor = cached_get_processor(model_config.model)
         hop_length = processor.feature_extractor.hop_length
         assert hop_length is not None
@@ -928,9 +928,9 @@ class WhisperForConditionalGeneration(
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        multimodal_embeddings: MultiModalEmbeddings | None = None,
         *,
-        is_multimodal: Optional[torch.Tensor] = None,
+        is_multimodal: torch.Tensor | None = None,
         handle_oov_mm_token: bool = False,
     ) -> torch.Tensor:
         # This method just returns the decoder sequence embeddings since
diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py
index b69204d020962..2610aa253b575 100644
--- a/vllm/model_executor/models/zamba2.py
+++ b/vllm/model_executor/models/zamba2.py
@@ -10,7 +10,7 @@ model alternates between state space model layers and attention-based layers.
 
 from collections.abc import Iterable
 from itertools import cycle
-from typing import Any, Optional, Union
+from typing import Any
 
 import torch
 from torch import nn
@@ -60,8 +60,8 @@ class Zamba2LoRA(nn.Module):
         self,
         input_dim: int,
         rank: int,
-        output_dim: Union[int, list[int]],
-        quant_config: Optional[QuantizationConfig] = None,
+        output_dim: int | list[int],
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ):
         """Initialize the attention layer.
@@ -106,8 +106,8 @@ class Zamba2Attention(nn.Module):
         config: Zamba2Config,
         bare_block_idx: int,
         num_hybrid_layers: int,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         """Initialize the attention layer.
@@ -288,7 +288,7 @@ class Zamba2MLP(nn.Module):
         config: Zamba2Config,
         bare_block_idx: int,
         num_hybrid_layers: dict[int, int],
-        quant_config: Optional[QuantizationConfig] = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         """Initialize the MLP layer.
@@ -386,8 +386,8 @@ class Zamba2AttentionDecoderLayer(nn.Module):
         config: Zamba2Config,
         bare_block_idx: int,
         num_hybrid_layers: int,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         """Initialize the decoder layer.
@@ -484,9 +484,9 @@ class Zamba2MambaDecoderLayer(nn.Module):
     def __init__(
         self,
         config: Zamba2Config,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         """Initialize the Mamba decoder layer.
@@ -523,9 +523,9 @@ class Zamba2MambaDecoderLayer(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        transformer_hidden_states: Optional[torch.Tensor] = None,
-        positions: Optional[torch.Tensor] = None,
-        original_hidden_states: Optional[torch.Tensor] = None,
+        transformer_hidden_states: torch.Tensor | None = None,
+        positions: torch.Tensor | None = None,
+        original_hidden_states: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Forward pass through the Mamba decoder layer.
 
@@ -581,9 +581,9 @@ class Zamba2HybridLayer(nn.Module):
         shared_transformer: Zamba2AttentionDecoderLayer,
         config: Zamba2Config,
         block_idx: int,
-        model_config: Optional[ModelConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        model_config: ModelConfig | None = None,
+        cache_config: CacheConfig | None = None,
+        quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
         """Initialize the hybrid layer.
@@ -764,8 +764,8 @@ class Zamba2Model(nn.Module):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, IntermediateTensors]:
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor | IntermediateTensors:
         """Forward pass through the model.
 
         Args:
@@ -947,7 +947,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid):
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
         **kwargs: Any,
     ) -> torch.Tensor:
         """Forward pass through the model.
@@ -973,7 +973,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid):
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """Compute logits for next token prediction.
 
         Args:
diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py
index 9341665f1bca2..fd21a3244eb35 100644
--- a/vllm/model_executor/parameter.py
+++ b/vllm/model_executor/parameter.py
@@ -1,9 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from collections.abc import Hashable
+from collections.abc import Callable, Hashable
 from fractions import Fraction
-from typing import Callable, Optional, Union
 from weakref import WeakValueDictionary
 
 import torch
@@ -36,7 +35,7 @@ class BasevLLMParameter(Parameter):
     into the parameter when the provided weight loader is called.
     """
 
-    def __new__(cls, data: Optional[torch.Tensor], **kwargs):
+    def __new__(cls, data: torch.Tensor | None, **kwargs):
         return super().__new__(cls, data=data, requires_grad=False)
 
     def __init__(self, data: torch.Tensor, weight_loader: Callable):
@@ -109,7 +108,7 @@ class BasevLLMParameter(Parameter):
     def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs):
         self._assert_and_load(loaded_weight)
 
-    def _shard_id_as_int(self, shard_id: Union[str, int]) -> int:
+    def _shard_id_as_int(self, shard_id: str | int) -> int:
         if isinstance(shard_id, int):
             return shard_id
 
@@ -290,7 +289,7 @@ class PerTensorScaleParameter(BasevLLMParameter):
         super().load_row_parallel_weight(*args, **kwargs)
 
     def _load_into_shard_id(
-        self, loaded_weight: torch.Tensor, shard_id: Union[str, int], **kwargs
+        self, loaded_weight: torch.Tensor, shard_id: str | int, **kwargs
     ):
         """
         Slice the parameter data based on the shard id for
@@ -320,10 +319,10 @@ class PackedColumnParameter(_ColumnvLLMParameter):
 
     def __init__(
         self,
-        packed_factor: Union[int, Fraction],
+        packed_factor: int | Fraction,
         packed_dim: int,
-        marlin_tile_size: Optional[int] = None,
-        bitblas_tile_size: Optional[int] = None,
+        marlin_tile_size: int | None = None,
+        bitblas_tile_size: int | None = None,
         **kwargs,
     ):
         self._packed_factor = packed_factor
@@ -371,10 +370,10 @@ class PackedvLLMParameter(ModelWeightParameter):
 
     def __init__(
         self,
-        packed_factor: Union[int, Fraction],
+        packed_factor: int | Fraction,
         packed_dim: int,
-        marlin_tile_size: Optional[int] = None,
-        bitblas_tile_size: Optional[int] = None,
+        marlin_tile_size: int | None = None,
+        bitblas_tile_size: int | None = None,
         **kwargs,
     ):
         self._packed_factor = packed_factor
@@ -437,7 +436,7 @@ class SharedWeightParameter(BasevLLMParameter):
     local_tensors: set[torch.Tensor]
 
     # dictionary mapping partition indices to associated parameters
-    partitions: dict[int, Union[ModelWeightParameter, Parameter]]
+    partitions: dict[int, ModelWeightParameter | Parameter]
 
     def __new__(cls, **kwargs):
         return super().__new__(cls, data=None, **kwargs)
@@ -547,7 +546,7 @@ class SharedWeightParameter(BasevLLMParameter):
         self,
         param: BasevLLMParameter,
         loaded_weight: torch.Tensor,
-        loaded_weight_shard_id: Optional[Union[str, int]],
+        loaded_weight_shard_id: str | int | None,
     ):
         raise ValueError(
             "When loading partition weights of "
diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py
index 4abd2625f8066..38cd230082f8e 100644
--- a/vllm/model_executor/utils.py
+++ b/vllm/model_executor/utils.py
@@ -3,7 +3,7 @@
 """Utils for model executor."""
 
 import copy
-from typing import Any, Optional
+from typing import Any
 
 import torch
 
@@ -16,7 +16,7 @@ def set_random_seed(seed: int) -> None:
 
 def set_weight_attrs(
     weight: torch.Tensor,
-    weight_attrs: Optional[dict[str, Any]],
+    weight_attrs: dict[str, Any] | None,
 ):
     """Set attributes on a weight tensor.
 
diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py
index d81354d9a399e..a483837d4fb6c 100644
--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@@ -3,7 +3,7 @@
 import base64
 from io import BytesIO
 from pathlib import Path
-from typing import Literal, Optional
+from typing import Literal
 
 import numpy as np
 import numpy.typing as npt
@@ -53,7 +53,7 @@ class AudioResampler:
 
     def __init__(
         self,
-        target_sr: Optional[float] = None,
+        target_sr: float | None = None,
         method: Literal["librosa", "scipy"] = "librosa",
     ):
         self.target_sr = target_sr
diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py
index 8b72bbe56eafd..f6ef675aa7c29 100644
--- a/vllm/multimodal/cache.py
+++ b/vllm/multimodal/cache.py
@@ -5,10 +5,10 @@ import sys
 from abc import ABC, abstractmethod
 from collections.abc import Mapping, Sequence
 from multiprocessing.synchronize import Lock as LockType
-from typing import TYPE_CHECKING, Generic, Optional, TypeVar, Union, cast
+from typing import TYPE_CHECKING, Generic, TypeAlias, TypeVar, cast
 
 import torch
-from typing_extensions import TypeAlias, override
+from typing_extensions import override
 
 from vllm.distributed.device_communicators.shm_object_storage import (
     MsgpackSerde,
@@ -85,14 +85,14 @@ class MultiModalProcessorCacheItemMetadata:
         self.prompt_updates = prompt_updates
 
 
-MultiModalCacheValue = Union[
-    MultiModalProcessorCacheItem,
-    MultiModalProcessorCacheItemMetadata,
-    MultiModalKwargsItems,
-    MultiModalKwargsItem,
-    MultiModalKwargs,
-    Mapping[str, NestedTensors],
-]
+MultiModalCacheValue: TypeAlias = (
+    MultiModalProcessorCacheItem
+    | MultiModalProcessorCacheItemMetadata
+    | MultiModalKwargsItems
+    | MultiModalKwargsItem
+    | MultiModalKwargs
+    | Mapping[str, NestedTensors]
+)
 
 _V = TypeVar("_V", bound=MultiModalCacheValue)
 
@@ -256,13 +256,13 @@ class BaseMultiModalCache(ABC, Generic[_I, _O]):
         raise NotImplementedError
 
 
-MultiModalProcessorCacheInItem: TypeAlias = Optional[
-    tuple[MultiModalKwargsItem, Sequence["ResolvedPromptUpdate"]]
-]
+MultiModalProcessorCacheInItem: TypeAlias = (
+    tuple[MultiModalKwargsItem, Sequence["ResolvedPromptUpdate"]] | None
+)
 
 
 MultiModalProcessorCacheOutItem: TypeAlias = tuple[
-    Optional[MultiModalKwargsItem], Sequence["ResolvedPromptUpdate"]
+    MultiModalKwargsItem | None, Sequence["ResolvedPromptUpdate"]
 ]
 
 
@@ -575,7 +575,7 @@ def _enable_mm_input_shm_cache(vllm_config: "VllmConfig") -> bool:
 def processor_cache_from_config(
     vllm_config: "VllmConfig",
     mm_registry: "MultiModalRegistry",
-) -> Optional[BaseMultiModalProcessorCache]:
+) -> BaseMultiModalProcessorCache | None:
     """Return a `BaseMultiModalProcessorCache`, if enabled."""
     model_config = vllm_config.model_config
 
@@ -602,7 +602,7 @@ def processor_only_cache_from_config(
 
 
 class BaseMultiModalReceiverCache(
-    BaseMultiModalCache[Optional[MultiModalKwargsItem], MultiModalKwargsItem]
+    BaseMultiModalCache[MultiModalKwargsItem | None, MultiModalKwargsItem]
 ):
     """The required interface for caches on P1."""
 
@@ -640,7 +640,7 @@ class MultiModalReceiverCache(BaseMultiModalReceiverCache):
     @override
     def get_and_update_item(
         self,
-        mm_item: Optional[MultiModalKwargsItem],
+        mm_item: MultiModalKwargsItem | None,
         mm_hash: str,
     ) -> MultiModalKwargsItem:
         if (cached_item := self._cache.get(mm_hash)) is not None:
@@ -692,7 +692,7 @@ class ShmObjectStoreReceiverCache(BaseMultiModalReceiverCache):
     @override
     def get_and_update_item(
         self,
-        mm_item: Optional[MultiModalKwargsItem],
+        mm_item: MultiModalKwargsItem | None,
         mm_hash: str,
     ) -> MultiModalKwargsItem:
         assert mm_item is not None, f"Expected an address item for {mm_hash=}"
@@ -711,7 +711,7 @@ class ShmObjectStoreReceiverCache(BaseMultiModalReceiverCache):
 def engine_receiver_cache_from_config(
     vllm_config: "VllmConfig",
     mm_registry: "MultiModalRegistry",
-) -> Optional[BaseMultiModalReceiverCache]:
+) -> BaseMultiModalReceiverCache | None:
     """
     This is used in the engine process.
     Return a `BaseMultiModalReceiverCache` only when IPC caching is enabled and
@@ -735,7 +735,7 @@ def worker_receiver_cache_from_config(
     vllm_config: "VllmConfig",
     mm_registry: "MultiModalRegistry",
     shared_worker_lock: LockType,
-) -> Optional[BaseMultiModalReceiverCache]:
+) -> BaseMultiModalReceiverCache | None:
     """
     This is used in the worker process.
     Return a `BaseMultiModalReceiverCache` only when IPC caching is enabled and
diff --git a/vllm/multimodal/evs.py b/vllm/multimodal/evs.py
index 36518c6bdb55a..4a288d2d238c2 100644
--- a/vllm/multimodal/evs.py
+++ b/vllm/multimodal/evs.py
@@ -9,7 +9,6 @@
 # license agreement from NVIDIA CORPORATION is strictly prohibited.
 
 import typing
-from typing import Union
 
 import torch
 
@@ -38,7 +37,7 @@ def compute_retained_tokens_count(
 
 def compute_retention_mask(
     video_embeds: torch.Tensor,
-    video_size_thw: Union[torch.LongTensor, tuple[int, int, int]],
+    video_size_thw: torch.LongTensor | tuple[int, int, int],
     spatial_merge_size: int,
     q: float,
 ) -> torch.Tensor:
diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py
index 91d86cd9a1897..d0dcbb25fcce8 100644
--- a/vllm/multimodal/hasher.py
+++ b/vllm/multimodal/hasher.py
@@ -4,7 +4,6 @@
 import pickle
 import uuid
 from collections.abc import Iterable
-from typing import Union
 
 import numpy as np
 import torch
@@ -18,7 +17,7 @@ logger = init_logger(__name__)
 
 class MultiModalHasher:
     @classmethod
-    def serialize_item(cls, obj: object) -> Iterable[Union[bytes, memoryview]]:
+    def serialize_item(cls, obj: object) -> Iterable[bytes | memoryview]:
         # Simple cases
         if isinstance(obj, (bytes, memoryview)):
             return (obj,)
@@ -84,7 +83,7 @@ class MultiModalHasher:
         cls,
         key: str,
         obj: object,
-    ) -> Iterable[Union[bytes, memoryview]]:
+    ) -> Iterable[bytes | memoryview]:
         # Recursive cases
         if isinstance(obj, (list, tuple)):
             for i, elem in enumerate(obj):
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index f50ab1faebbad..21e8bef97a787 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -3,7 +3,6 @@
 
 from io import BytesIO
 from pathlib import Path
-from typing import Union
 
 import pybase64
 import torch
@@ -26,7 +25,7 @@ def rescale_image_size(
 
 def rgba_to_rgb(
     image: Image.Image,
-    background_color: Union[tuple[int, int, int], list[int]] = (255, 255, 255),
+    background_color: tuple[int, int, int] | list[int] = (255, 255, 255),
 ) -> Image.Image:
     """Convert an RGBA image to RGB with filled background color."""
     assert image.mode == "RGBA"
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index bec3099a99bc5..dec2e0acab6bd 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -7,10 +7,20 @@ from collections.abc import Mapping, Sequence
 from dataclasses import dataclass
 from functools import partial
 from itertools import accumulate
-from typing import TYPE_CHECKING, Any, Literal, Optional, TypedDict, Union, cast, final
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Literal,
+    Optional,
+    TypeAlias,
+    TypedDict,
+    Union,
+    cast,
+    final,
+)
 
 import numpy as np
-from typing_extensions import NotRequired, TypeAlias, TypeVar, deprecated
+from typing_extensions import NotRequired, TypeVar, deprecated
 
 from vllm.utils import LazyLoader, full_groupby, is_list_of
 from vllm.utils.jsontree import json_map_leaves
@@ -85,7 +95,7 @@ which are treated as audio embeddings;
 these are directly passed to the model without HF processing.
 """
 
-ModalityData: TypeAlias = Union[_T, list[Optional[_T]], None]
+ModalityData: TypeAlias = _T | list[_T | None] | None
 """
 Either a single data item, or a list of data items. Can only be None if UUID
 is provided.
@@ -117,7 +127,7 @@ The built-in modalities are defined by
 [`MultiModalDataBuiltins`][vllm.multimodal.inputs.MultiModalDataBuiltins].
 """
 
-MultiModalUUIDDict: TypeAlias = Mapping[str, Union[list[Optional[str]], str]]
+MultiModalUUIDDict: TypeAlias = Mapping[str, list[str | None] | str]
 """
 A dictionary containing user-provided UUIDs for items in each modality.
 If a UUID for an item is not provided, its entry will be `None` and
@@ -412,7 +422,7 @@ class MultiModalFlatField(BaseMultiModalField):
         [`MultiModalFieldConfig.flat_from_sizes`][vllm.multimodal.inputs.MultiModalFieldConfig.flat_from_sizes]
     """
 
-    slices: Union[Sequence[slice], Sequence[Sequence[slice]]]
+    slices: Sequence[slice] | Sequence[Sequence[slice]]
     dim: int = 0
 
     def build_elems(
@@ -524,7 +534,7 @@ class MultiModalFieldConfig:
     @staticmethod
     def flat(
         modality: str,
-        slices: Union[Sequence[slice], Sequence[Sequence[slice]]],
+        slices: Sequence[slice] | Sequence[Sequence[slice]],
         dim: int = 0,
     ):
         """
@@ -729,7 +739,7 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
 _I = TypeVar(
     "_I",
     MultiModalKwargsItem,
-    Optional[MultiModalKwargsItem],
+    MultiModalKwargsItem | None,
     default=MultiModalKwargsItem,
 )
 
@@ -818,10 +828,10 @@ class MultiModalKwargsItems(UserDict[str, Sequence[_I]]):
         )
 
 
-MultiModalKwargsOptionalItems: TypeAlias = Union[
-    MultiModalKwargsItems[MultiModalKwargsItem],
-    MultiModalKwargsItems[Optional[MultiModalKwargsItem]],
-]
+MultiModalKwargsOptionalItems: TypeAlias = (
+    MultiModalKwargsItems[MultiModalKwargsItem]
+    | MultiModalKwargsItems[MultiModalKwargsItem | None]
+)
 
 
 class MultiModalKwargs(UserDict[str, NestedTensors]):
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 8fdc5cf721d08..7483553095219 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -10,14 +10,14 @@ from typing import (
     Generic,
     Literal,
     NamedTuple,
-    Optional,
+    TypeAlias,
+    TypeGuard,
     TypeVar,
-    Union,
 )
 
 import numpy as np
 import torch
-from typing_extensions import TypeAlias, TypeGuard, assert_never
+from typing_extensions import assert_never
 
 from vllm.utils import LazyLoader, is_list_of
 
@@ -111,7 +111,7 @@ class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]):
 
 
 class EmbeddingItems(
-    ModalityDataItems[Union[torch.Tensor, list[torch.Tensor]], torch.Tensor]
+    ModalityDataItems[torch.Tensor | list[torch.Tensor], torch.Tensor]
 ):
     """
     Base class for data items that are expressed as a batched embedding tensor,
@@ -195,7 +195,7 @@ class DictEmbeddingItems(
 
 
 class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]):
-    def __init__(self, data: Optional[Sequence[HfAudioItem]]) -> None:
+    def __init__(self, data: Sequence[HfAudioItem] | None) -> None:
         if data is None:
             data = [None]
         super().__init__(data, "audio")
@@ -206,7 +206,7 @@ class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]):
 
 
 class AudioEmbeddingItems(EmbeddingItems):
-    def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
+    def __init__(self, data: torch.Tensor | list[torch.Tensor]) -> None:
         super().__init__(data, "audio")
 
 
@@ -216,7 +216,7 @@ class ImageSize(NamedTuple):
 
 
 class ImageProcessorItems(ProcessorBatchItems[HfImageItem]):
-    def __init__(self, data: Optional[Sequence[HfImageItem]]) -> None:
+    def __init__(self, data: Sequence[HfImageItem] | None) -> None:
         if data is None:
             data = [None]
         super().__init__(data, "image")
@@ -234,17 +234,15 @@ class ImageProcessorItems(ProcessorBatchItems[HfImageItem]):
 
 
 class ImageEmbeddingItems(EmbeddingItems):
-    def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
+    def __init__(self, data: torch.Tensor | list[torch.Tensor]) -> None:
         super().__init__(data, "image")
 
 
 class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]):
     def __init__(
         self,
-        data: Optional[Sequence[HfVideoItem]],
-        metadata: Optional[
-            Union[dict[str, Any], list[Optional[dict[str, Any]]]]
-        ] = None,
+        data: Sequence[HfVideoItem] | None,
+        metadata: dict[str, Any] | list[dict[str, Any] | None] | None = None,
     ) -> None:
         if data is None:
             data = [None]
@@ -267,7 +265,7 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]):
 
 
 class VideoEmbeddingItems(EmbeddingItems):
-    def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
+    def __init__(self, data: torch.Tensor | list[torch.Tensor]) -> None:
         super().__init__(data, "video")
 
 
@@ -306,7 +304,7 @@ class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
     def get_items(
         self,
         modality: str,
-        typ: Union[type[_D], tuple[type[_D], ...]],
+        typ: type[_D] | tuple[type[_D], ...],
     ) -> _D:
         """
         Get the data items belonging to a modality,
@@ -331,7 +329,7 @@ class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
 
 
 ModalityDataParser: TypeAlias = Callable[
-    [ModalityData[Any]], Optional[ModalityDataItems[Any, Any]]
+    [ModalityData[Any]], ModalityDataItems[Any, Any] | None
 ]
 
 
@@ -348,7 +346,7 @@ class MultiModalDataParser:
     def __init__(
         self,
         *,
-        target_sr: Optional[float] = None,
+        target_sr: float | None = None,
         audio_resample_method: Literal["librosa", "scipy"] = "librosa",
         video_needs_metadata: bool = False,
     ) -> None:
@@ -362,7 +360,7 @@ class MultiModalDataParser:
 
     def _is_embeddings(
         self, data: object
-    ) -> TypeGuard[Union[torch.Tensor, list[torch.Tensor]]]:
+    ) -> TypeGuard[torch.Tensor | list[torch.Tensor]]:
         if isinstance(data, torch.Tensor):
             return data.ndim == 3
         if is_list_of(data, torch.Tensor):
@@ -381,7 +379,7 @@ class MultiModalDataParser:
     def _get_audio_with_sr(
         self,
         audio: AudioItem,
-    ) -> tuple[np.ndarray, Optional[float]]:
+    ) -> tuple[np.ndarray, float | None]:
         if isinstance(audio, tuple):
             return audio
         if isinstance(audio, list):
@@ -396,7 +394,7 @@ class MultiModalDataParser:
     def _get_video_with_metadata(
         self,
         video: VideoItem,
-    ) -> tuple[np.ndarray, Optional[dict[str, Any]]]:
+    ) -> tuple[np.ndarray, dict[str, Any] | None]:
         if isinstance(video, tuple):
             return video
         if isinstance(video, list):
@@ -411,7 +409,7 @@ class MultiModalDataParser:
     def _parse_audio_data(
         self,
         data: ModalityData[AudioItem],
-    ) -> Optional[ModalityDataItems[Any, Any]]:
+    ) -> ModalityDataItems[Any, Any] | None:
         if data is None:
             return AudioProcessorItems(None)
 
@@ -451,7 +449,7 @@ class MultiModalDataParser:
     def _parse_image_data(
         self,
         data: ModalityData[ImageItem],
-    ) -> Optional[ModalityDataItems[Any, Any]]:
+    ) -> ModalityDataItems[Any, Any] | None:
         if data is None:
             return ImageProcessorItems(None)
 
@@ -477,7 +475,7 @@ class MultiModalDataParser:
     def _parse_video_data(
         self,
         data: ModalityData[VideoItem],
-    ) -> Optional[ModalityDataItems[Any, Any]]:
+    ) -> ModalityDataItems[Any, Any] | None:
         if data is None:
             return VideoProcessorItems(None)
 
@@ -500,8 +498,8 @@ class MultiModalDataParser:
         else:
             data_items = data
 
-        new_videos = list[tuple[np.ndarray, Optional[dict[str, Any]]]]()
-        metadata_lst: list[Optional[dict[str, Any]]] = []
+        new_videos = list[tuple[np.ndarray, dict[str, Any] | None]]()
+        metadata_lst: list[dict[str, Any] | None] = []
         for data_item in data_items:
             video, metadata = self._get_video_with_metadata(data_item)
             if self.video_needs_metadata:
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 5c3739e29d101..5d9876539499d 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -12,9 +12,8 @@ from typing import (
     Any,
     Generic,
     NamedTuple,
-    Optional,
     Protocol,
-    Union,
+    TypeAlias,
     cast,
     overload,
 )
@@ -57,12 +56,20 @@ if TYPE_CHECKING:
 
     from .cache import BaseMultiModalProcessorCache
     from .profiling import BaseDummyInputsBuilder
+else:
+    PretrainedConfig = object
+    BatchFeature = object
+    ProcessorMixin = object
+
+    ModelConfig = object
+
+    BaseMultiModalProcessorCache = object
 
 logger = init_logger(__name__)
 
 _S = TypeVar("_S", str, list[int])
 
-PromptSeq = Union[str, list[int]]
+PromptSeq: TypeAlias = str | list[int]
 """A token sequence (list of token IDs) or text."""
 
 
@@ -71,7 +78,7 @@ def _cached_encode(
     tokenizer: AnyTokenizer,
     text: str,
     *,
-    add_special_tokens: Optional[bool] = None,
+    add_special_tokens: bool | None = None,
 ) -> list[int]:
     return encode_tokens(tokenizer, text, add_special_tokens=add_special_tokens)
 
@@ -81,7 +88,7 @@ def _cached_decode(
     tokenizer: AnyTokenizer,
     token_ids: tuple[int, ...],
     *,
-    skip_special_tokens: Optional[bool] = None,
+    skip_special_tokens: bool | None = None,
 ) -> str:
     return decode_tokens(
         tokenizer, list(token_ids), skip_special_tokens=skip_special_tokens
@@ -108,7 +115,7 @@ class _GetMatchIndex(Protocol):
         tokenizer: AnyTokenizer,
         prompt: PromptSeq,
         start_idx: int = 0,
-    ) -> Optional[int]: ...
+    ) -> int | None: ...
 
 
 @dataclass
@@ -138,7 +145,7 @@ class PromptIndexTargets:
             tokenizer: AnyTokenizer,
             prompt: PromptSeq,
             start_idx: int = 0,
-        ) -> Optional[int]:
+        ) -> int | None:
             if start_idx != 0:
                 return None
 
@@ -168,12 +175,12 @@ class PromptIndexTargets:
         return PromptIndex(lambda tokenizer, prompt, start_idx=0: len(prompt))
 
 
-UpdateTarget = Union[PromptSeq, PromptIndex]
+UpdateTarget: TypeAlias = PromptSeq | PromptIndex
 """
 The token sequence or text to update.
 """
 
-PromptUpdateTarget = Union[Callable[[int], UpdateTarget], UpdateTarget]
+PromptUpdateTarget: TypeAlias = Callable[[int], UpdateTarget] | UpdateTarget
 """
 Given the index of the processed item within
 [`modality`][vllm.multimodal.processing.PromptUpdate.modality],
@@ -191,7 +198,7 @@ class PromptUpdateDetails(Generic[_S]):
     full: _S
     """The full content."""
 
-    is_embed: Optional[Callable[[AnyTokenizer, PromptSeq], torch.Tensor]] = None
+    is_embed: Callable[[AnyTokenizer, PromptSeq], torch.Tensor] | None = None
     """
     Given [`full`][vllm.multimodal.processing.PromptUpdateDetails.full],
     return a boolean mask of shape `(len(full),)` indicating which positions
@@ -236,7 +243,7 @@ class PromptUpdateDetails(Generic[_S]):
         return PromptUpdateDetails(full=seq, is_embed=is_embed)
 
 
-PromptUpdateInfo = Union[PromptSeq, PromptUpdateDetails]
+PromptUpdateInfo: TypeAlias = PromptSeq | PromptUpdateDetails
 """
 The token sequence or text that are part of the update.
 
@@ -245,7 +252,7 @@ use [`PromptUpdateDetails`][vllm.multimodal.processing.PromptUpdateDetails] to
 specify which part.
 """
 
-PromptUpdateContent = Union[Callable[[int], PromptUpdateInfo], PromptUpdateInfo]
+PromptUpdateContent: TypeAlias = Callable[[int], PromptUpdateInfo] | PromptUpdateInfo
 """
 Given the index of the processed item within
 [`modality`][vllm.multimodal.processing.PromptUpdate.modality],
@@ -472,7 +479,7 @@ class _HasModalityProp(Protocol):
     def modality(self) -> str: ...
 
 
-_M = TypeVar("_M", bound=Union[_HasModalityAttr, _HasModalityProp])
+_M = TypeVar("_M", bound=_HasModalityAttr | _HasModalityProp)
 
 
 def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]:
@@ -554,7 +561,7 @@ class ResolvedPromptUpdate:
 
     def iter_matches(
         self,
-        prompt: Union[list[int], str],
+        prompt: list[int] | str,
         tokenizer: AnyTokenizer,
         *,
         start_idx: int = 0,
@@ -642,7 +649,7 @@ class PlaceholderFeaturesInfo:
     item_idx: int
     start_idx: int
     tokens: list[int]
-    is_embed: Optional[torch.Tensor]
+    is_embed: torch.Tensor | None
 
     @property
     def length(self) -> int:
@@ -668,8 +675,8 @@ def _find_matches(
     *,
     prev_end_idx: int = 0,
     current_result: "MultiModalPromptUpdatesApplyResult",
-) -> tuple[Optional[UpdateMode], list[_MatchToApply]]:
-    mode: Optional[UpdateMode] = None
+) -> tuple[UpdateMode | None, list[_MatchToApply]]:
+    mode: UpdateMode | None = None
     mm_matches = dict[tuple[str, int], tuple[PromptTargetMatch, int]]()
 
     for modality, modality_updates in mm_prompt_updates.items():
@@ -723,7 +730,7 @@ def _apply_matches(
 ) -> tuple[list[_S], "MultiModalPromptUpdatesApplyResult"]:
     prompt_len = len(prompt)
 
-    out_seqs = list[Union[str, list[int]]]()
+    out_seqs = list[str | list[int]]()
     out_result: MultiModalPromptUpdatesApplyResult = {
         m: [None] * len(items) for m, items in mm_prompt_updates.items()
     }
@@ -880,8 +887,8 @@ def find_mm_placeholders(
 
 
 _T = TypeVar("_T")
-_C = TypeVar("_C", bound="PretrainedConfig", default="PretrainedConfig")
-_P = TypeVar("_P", bound="ProcessorMixin", default="ProcessorMixin")
+_C = TypeVar("_C", bound=PretrainedConfig, default=PretrainedConfig)
+_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
 
 
 @dataclass(frozen=True)
@@ -891,25 +898,25 @@ class InputProcessingContext:
     modify the inputs.
     """
 
-    model_config: "ModelConfig"
+    model_config: ModelConfig
     """The configuration of the model."""
 
     tokenizer: AnyTokenizer
     """The tokenizer used to tokenize the inputs."""
 
     @overload
-    def get_hf_config(self, /) -> "PretrainedConfig": ...
+    def get_hf_config(self, /) -> PretrainedConfig: ...
 
     @overload
     def get_hf_config(
         self,
-        typ: Union[type[_C], tuple[type[_C], ...]],
+        typ: type[_C] | tuple[type[_C], ...],
         /,
     ) -> _C: ...
 
     def get_hf_config(
         self,
-        typ: Optional[Union[type[Any], tuple[type[Any], ...]]] = None,
+        typ: type[Any] | tuple[type[Any], ...] | None = None,
         /,
     ) -> Any:
         """
@@ -955,19 +962,19 @@ class InputProcessingContext:
         return mm_config
 
     @overload
-    def get_hf_processor(self, /, **kwargs: object) -> "ProcessorMixin": ...
+    def get_hf_processor(self, /, **kwargs: object) -> ProcessorMixin: ...
 
     @overload
     def get_hf_processor(
         self,
-        typ: Union[type[_P], tuple[type[_P], ...]],
+        typ: type[_P] | tuple[type[_P], ...],
         /,
         **kwargs: object,
     ) -> _P: ...
 
     def get_hf_processor(
         self,
-        typ: Optional[Union[type[Any], tuple[type[Any], ...]]] = None,
+        typ: type[Any] | tuple[type[Any], ...] | None = None,
         /,
         **kwargs: object,
     ) -> Any:
@@ -1026,13 +1033,13 @@ class InputProcessingContext:
 
     def call_hf_processor(
         self,
-        hf_processor: "ProcessorMixin",
+        hf_processor: ProcessorMixin,
         data: Mapping[str, object],
         kwargs: Mapping[str, object] = {},
         *,
         num_tries: int = 1,
         max_tries: int = 5,
-    ) -> Union["BatchFeature", JSONTree]:
+    ) -> BatchFeature | JSONTree:
         """
         Call `hf_processor` on the prompt `data`
         (text, image, audio...) with configurable options `kwargs`.
@@ -1113,10 +1120,10 @@ class BaseProcessingInfo:
     def get_tokenizer(self) -> AnyTokenizer:
         return self.ctx.tokenizer
 
-    def get_hf_config(self) -> "PretrainedConfig":
+    def get_hf_config(self) -> PretrainedConfig:
         return self.ctx.get_hf_config()
 
-    def get_hf_processor(self, **kwargs: object) -> "ProcessorMixin":
+    def get_hf_processor(self, **kwargs: object) -> ProcessorMixin:
         """
         Subclasses can override this method to handle
         specific kwargs from model config or user inputs.
@@ -1124,7 +1131,7 @@ class BaseProcessingInfo:
         return self.ctx.get_hf_processor(**kwargs)
 
     @abstractmethod
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
         """
         Return the maximum supported number of items for each modality.
 
@@ -1156,7 +1163,7 @@ class BaseProcessingInfo:
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-    ) -> Optional[Mapping[str, int]]:
+    ) -> Mapping[str, int] | None:
         """
         Return the maximum number of tokens per item of for each modality.
 
@@ -1193,7 +1200,7 @@ A collection of prompt updates with a similar structure as
 [`MultiModalKwargsItems`][vllm.multimodal.inputs.MultiModalKwargsItems].
 """
 
-MultiModalPromptUpdatesApplyResult = Mapping[str, list[Optional[int]]]
+MultiModalPromptUpdatesApplyResult = Mapping[str, list[int | None]]
 """
 For an item `MultiModalPromptUpdates[k][i]`,
 `MultiModalPromptUpdatesApplyResult[k][i]` represents the index of the
@@ -1220,7 +1227,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         info: _I,
         dummy_inputs: "BaseDummyInputsBuilder[_I]",
         *,
-        cache: Optional["BaseMultiModalProcessorCache"] = None,
+        cache: BaseMultiModalProcessorCache | None = None,
     ) -> None:
         super().__init__()
 
@@ -1248,7 +1255,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalInputs:
         return self.apply(prompt, mm_data, hf_processor_mm_kwargs, mm_uuids=mm_uuids)
 
@@ -1305,7 +1312,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
     @abstractmethod
     def _get_mm_fields_config(
         self,
-        hf_inputs: "BatchFeature",
+        hf_inputs: BatchFeature,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
         """Given the HF-processed data, output the metadata of each field."""
@@ -1411,7 +1418,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         mm_data: Mapping[str, object],
         mm_kwargs: Mapping[str, object],
         tok_kwargs: Mapping[str, object],
-    ) -> "BatchFeature":
+    ) -> BatchFeature:
         """
         Call the HF processor on the prompt text and
         associated multi-modal data.
@@ -1447,7 +1454,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
-    ) -> tuple[list[int], "BatchFeature", bool]:
+    ) -> tuple[list[int], BatchFeature, bool]:
         """
         Apply the HF processor on the prompt text and multi-modal data
         together.
@@ -1518,7 +1525,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
-    ) -> "BatchFeature":
+    ) -> BatchFeature:
         """
         Apply the HF processor on the multi-modal data only.
 
@@ -1540,13 +1547,13 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
 
     def _apply_hf_processor_main(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
         *,
         enable_hf_prompt_update: bool,
-    ) -> tuple[list[int], "BatchFeature", bool]:
+    ) -> tuple[list[int], BatchFeature, bool]:
         """
         Apply the HF processor on the prompt text and multi-modal data.
 
@@ -1585,7 +1592,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalHashes:
         """Create MM hashes to be returned.
 
@@ -1647,7 +1654,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
 
     def _get_cache_missing_items(
         self,
-        cache: "BaseMultiModalProcessorCache",
+        cache: BaseMultiModalProcessorCache,
         mm_data_items: MultiModalDataItems,
         mm_hashes: MultiModalHashes,
     ) -> MultiModalDataItems:
@@ -1692,7 +1699,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
 
     def _merge_mm_kwargs(
         self,
-        cache: "BaseMultiModalProcessorCache",
+        cache: BaseMultiModalProcessorCache,
         mm_hashes: MultiModalHashes,
         mm_missing_kwargs: MultiModalKwargsItems,
         mm_missing_prompt_updates: MultiModalPromptUpdates,
@@ -1705,7 +1712,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
 
         mm_missing_next_idx = defaultdict[str, int](lambda: 0)
 
-        merged_kwargs = defaultdict[str, list[Optional[MultiModalKwargsItem]]](list)
+        merged_kwargs = defaultdict[str, list[MultiModalKwargsItem | None]](list)
         merged_prompt_updates = defaultdict[str, list[Sequence[ResolvedPromptUpdate]]](
             list
         )
@@ -1714,7 +1721,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
             missing_prompt_updates = mm_missing_prompt_updates.get(modality, [])
 
             for item_idx, item_hash in enumerate(hashes):
-                kwargs: Optional[MultiModalKwargsItem]
+                kwargs: MultiModalKwargsItem | None
                 if not mm_is_cached[modality][item_idx]:
                     missing_next_idx = mm_missing_next_idx[modality]
                     kwargs = missing_kwargs[missing_next_idx]
@@ -1743,12 +1750,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
 
     def _apply_hf_processor(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         (
             prompt_ids,
@@ -1791,12 +1798,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
 
     def _cached_apply_hf_processor(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
         tokenization_kwargs: Mapping[str, object],
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         """
         Apply the HF processor on the full prompt text,
@@ -2026,12 +2033,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
 
     def apply(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-        tokenization_kwargs: Optional[Mapping[str, object]] = None,
+        tokenization_kwargs: Mapping[str, object] | None = None,
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalInputs:
         """
         Process multi-modal inputs to be used in vLLM.
@@ -2090,9 +2097,9 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
     @abstractmethod
     def create_encoder_prompt(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data: MultiModalDataDict,
-    ) -> Union[str, list[int]]:
+    ) -> str | list[int]:
         """
         Create input prompt for the encoder. HF processor will be applied on
         this prompt during profiling and generation.
@@ -2105,15 +2112,15 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
 
     def create_decoder_prompt(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data: MultiModalDataDict,
-    ) -> Union[str, list[int]]:
+    ) -> str | list[int]:
         """Create input prompt for the decoder."""
         return prompt
 
     def _get_enc_dec_inputs(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data: MultiModalDataDict,
         encoder_inputs: MultiModalInputs,
     ):
@@ -2135,12 +2142,12 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
 
     def apply(
         self,
-        prompt: Union[str, list[int]],
+        prompt: str | list[int],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
-        tokenization_kwargs: Optional[Mapping[str, object]] = None,
+        tokenization_kwargs: Mapping[str, object] | None = None,
         *,
-        mm_uuids: Optional[MultiModalUUIDDict] = None,
+        mm_uuids: MultiModalUUIDDict | None = None,
     ) -> MultiModalEncDecInputs:
         """
         Process multi-modal inputs to be used in vLLM.
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index 05ba5a2abdd41..90b19961c6eb8 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -3,7 +3,7 @@
 from abc import ABC, abstractmethod
 from collections.abc import Mapping
 from dataclasses import dataclass, field
-from typing import Generic, NamedTuple, Optional, TypeVar, Union, cast
+from typing import Generic, NamedTuple, TypeVar, cast
 
 import numpy as np
 import numpy.typing as npt
@@ -41,7 +41,7 @@ class ProcessorInputs:
     [`vllm.multimodal.processing.BaseMultiModalProcessor.apply`][].
     """
 
-    prompt: Union[str, list[int]]
+    prompt: str | list[int]
     mm_data: MultiModalDataDict
     hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict)
     tokenization_kwargs: Mapping[str, object] = field(default_factory=dict)
@@ -87,7 +87,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalDataDict:
         """
         Build the multimodal input which, after processing, results in
@@ -107,7 +107,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> ProcessorInputs:
         """
         Build the input which, after processing, results in
@@ -136,7 +136,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
         *,
         length: int,
         num_audios: int,
-        overrides: Optional[AudioDummyOptions] = None,
+        overrides: AudioDummyOptions | None = None,
     ) -> list[npt.NDArray]:
         if num_audios == 0:
             return []
@@ -158,7 +158,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
         width: int,
         height: int,
         num_images: int,
-        overrides: Optional[ImageDummyOptions] = None,
+        overrides: ImageDummyOptions | None = None,
     ) -> list[Image.Image]:
         if num_images == 0:
             return []
@@ -191,7 +191,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
         height: int,
         num_frames: int,
         num_videos: int,
-        overrides: Optional[VideoDummyOptions] = None,
+        overrides: VideoDummyOptions | None = None,
     ) -> list[npt.NDArray]:
         if num_videos == 0:
             return []
@@ -254,8 +254,8 @@ class MultiModalProfiler(Generic[_I]):
     def _get_dummy_mm_inputs(
         self,
         seq_len: int,
-        mm_counts: Optional[Mapping[str, int]] = None,
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_counts: Mapping[str, int] | None = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> MultiModalInputs:
         if mm_counts is None:
             mm_counts = self.get_mm_limits()
@@ -290,8 +290,8 @@ class MultiModalProfiler(Generic[_I]):
     def get_encoder_dummy_data(
         self,
         seq_len: int,
-        mm_counts: Optional[Mapping[str, int]] = None,
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_counts: Mapping[str, int] | None = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> DummyEncoderData:
         mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts, mm_options)
         mm_inputs = cast(MultiModalEncDecInputs, mm_inputs)
@@ -324,8 +324,8 @@ class MultiModalProfiler(Generic[_I]):
     def get_decoder_dummy_data(
         self,
         seq_len: int,
-        mm_counts: Optional[Mapping[str, int]] = None,
-        mm_options: Optional[Mapping[str, BaseDummyOptions]] = None,
+        mm_counts: Mapping[str, int] | None = None,
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
     ) -> DummyDecoderData:
         mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts, mm_options)
 
@@ -344,7 +344,7 @@ class MultiModalProfiler(Generic[_I]):
     def _get_mm_max_tokens(
         self,
         seq_len: int,
-        mm_counts: Optional[Mapping[str, int]] = None,
+        mm_counts: Mapping[str, int] | None = None,
         mm_embeddings_only: bool = True,
     ) -> Mapping[str, int]:
         if mm_counts is None:
@@ -363,7 +363,7 @@ class MultiModalProfiler(Generic[_I]):
     def get_mm_max_contiguous_tokens(
         self,
         seq_len: int,
-        mm_counts: Optional[Mapping[str, int]] = None,
+        mm_counts: Mapping[str, int] | None = None,
     ):
         """
         Returns the maximum length of the multimodal (image placeholders+text)
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index a526eaff715ac..66d0bb7458c07 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Mapping
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Generic, Optional, Protocol, TypeVar
+from typing import TYPE_CHECKING, Generic, Protocol, TypeVar
 
 import torch.nn as nn
 
@@ -69,7 +69,7 @@ class MultiModalProcessorFactory(Protocol[_I]):  # type: ignore[misc]
         info: _I,
         dummy_inputs: BaseDummyInputsBuilder[_I],
         *,
-        cache: Optional[BaseMultiModalProcessorCache] = None,
+        cache: BaseMultiModalProcessorCache | None = None,
     ) -> BaseMultiModalProcessor[_I]: ...
 
 
@@ -83,7 +83,7 @@ class _ProcessorFactories(Generic[_I]):
         self,
         ctx: InputProcessingContext,
         *,
-        cache: Optional[BaseMultiModalProcessorCache] = None,
+        cache: BaseMultiModalProcessorCache | None = None,
     ):
         info = self.info(ctx)
         dummy_inputs_builder = self.dummy_inputs(info)
@@ -101,7 +101,7 @@ class MultiModalRegistry:
     def _extract_mm_options(
         self,
         model_config: "ModelConfig",
-    ) -> Optional[Mapping[str, BaseDummyOptions]]:
+    ) -> Mapping[str, BaseDummyOptions] | None:
         """
         Extract multimodal dummy options from model config.
 
@@ -151,7 +151,7 @@ class MultiModalRegistry:
         self,
         model_config: "ModelConfig",
         *,
-        cache: Optional[BaseMultiModalProcessorCache] = None,
+        cache: BaseMultiModalProcessorCache | None = None,
     ) -> Mapping[str, int]:
         """
         Get the maximum number of tokens per data item from each modality based
@@ -175,7 +175,7 @@ class MultiModalRegistry:
         self,
         model_config: "ModelConfig",
         *,
-        cache: Optional[BaseMultiModalProcessorCache] = None,
+        cache: BaseMultiModalProcessorCache | None = None,
     ) -> Mapping[str, int]:
         """
         Get the maximum number of tokens per data item from each modality based
@@ -202,7 +202,7 @@ class MultiModalRegistry:
         self,
         model_config: "ModelConfig",
         *,
-        cache: Optional[BaseMultiModalProcessorCache] = None,
+        cache: BaseMultiModalProcessorCache | None = None,
     ) -> Mapping[str, int]:
         """
         Get the maximum number of multi-modal input instances for each modality
@@ -259,7 +259,7 @@ class MultiModalRegistry:
     def _create_processing_ctx(
         self,
         model_config: "ModelConfig",
-        tokenizer: Optional[AnyTokenizer] = None,
+        tokenizer: AnyTokenizer | None = None,
     ) -> InputProcessingContext:
         if tokenizer is None and not model_config.skip_tokenizer_init:
             tokenizer = cached_tokenizer_from_config(model_config)
@@ -269,7 +269,7 @@ class MultiModalRegistry:
         self,
         model_config: "ModelConfig",
         *,
-        tokenizer: Optional[AnyTokenizer] = None,
+        tokenizer: AnyTokenizer | None = None,
     ) -> BaseProcessingInfo:
         model_cls = self._get_model_cls(model_config)
         factories = self._processor_factories[model_cls]
@@ -280,8 +280,8 @@ class MultiModalRegistry:
         self,
         model_config: "ModelConfig",
         *,
-        tokenizer: Optional[AnyTokenizer] = None,
-        cache: Optional[BaseMultiModalProcessorCache] = None,
+        tokenizer: AnyTokenizer | None = None,
+        cache: BaseMultiModalProcessorCache | None = None,
     ) -> BaseMultiModalProcessor[BaseProcessingInfo]:
         """
         Create a multi-modal processor for a specific model and tokenizer.
@@ -300,9 +300,9 @@ class MultiModalRegistry:
         self,
         model_config: "ModelConfig",
         seq_len: int,
-        mm_counts: Optional[Mapping[str, int]] = None,
+        mm_counts: Mapping[str, int] | None = None,
         *,
-        cache: Optional[BaseMultiModalProcessorCache] = None,
+        cache: BaseMultiModalProcessorCache | None = None,
     ) -> DummyDecoderData:
         """
         Create dummy data for profiling the memory usage of a model.
@@ -333,9 +333,9 @@ class MultiModalRegistry:
         self,
         model_config: "ModelConfig",
         seq_len: int,
-        mm_counts: Optional[Mapping[str, int]] = None,
+        mm_counts: Mapping[str, int] | None = None,
         *,
-        cache: Optional[BaseMultiModalProcessorCache] = None,
+        cache: BaseMultiModalProcessorCache | None = None,
     ) -> DummyEncoderData:
         """
         Create dummy data for profiling the memory usage of a model.
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index c9dc077d0385f..ecc1862c42f8c 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -7,7 +7,7 @@ from collections.abc import Iterable
 from concurrent.futures import ThreadPoolExecutor
 from itertools import groupby
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
+from typing import TYPE_CHECKING, Any, TypeVar
 from urllib.parse import ParseResult, urlparse
 from urllib.request import url2pathname
 
@@ -49,11 +49,11 @@ atexit.register(global_thread_pool.shutdown)
 class MediaConnector:
     def __init__(
         self,
-        media_io_kwargs: Optional[dict[str, dict[str, Any]]] = None,
+        media_io_kwargs: dict[str, dict[str, Any]] | None = None,
         connection: HTTPConnection = global_http_connection,
         *,
         allowed_local_media_path: str = "",
-        allowed_media_domains: Optional[list[str]] = None,
+        allowed_media_domains: list[str] | None = None,
     ) -> None:
         """
         Args:
@@ -143,7 +143,7 @@ class MediaConnector:
         url: str,
         media_io: MediaIO[_M],
         *,
-        fetch_timeout: Optional[int] = None,
+        fetch_timeout: int | None = None,
     ) -> _M:  # type: ignore[type-var]
         url_spec = urlparse(url)
 
@@ -173,7 +173,7 @@ class MediaConnector:
         url: str,
         media_io: MediaIO[_M],
         *,
-        fetch_timeout: Optional[int] = None,
+        fetch_timeout: int | None = None,
     ) -> _M:
         url_spec = urlparse(url)
         loop = asyncio.get_running_loop()
@@ -207,7 +207,7 @@ class MediaConnector:
     def fetch_audio(
         self,
         audio_url: str,
-    ) -> tuple[np.ndarray, Union[int, float]]:
+    ) -> tuple[np.ndarray, int | float]:
         """
         Load audio from a URL.
         """
@@ -222,7 +222,7 @@ class MediaConnector:
     async def fetch_audio_async(
         self,
         audio_url: str,
-    ) -> tuple[np.ndarray, Union[int, float]]:
+    ) -> tuple[np.ndarray, int | float]:
         """
         Asynchronously fetch audio from a URL.
         """
@@ -396,7 +396,7 @@ def group_mm_kwargs_by_modality(
     *,
     device: torch.types.Device = None,
     pin_memory: bool = False,
-    merge_by_field_config: Optional[bool] = None,
+    merge_by_field_config: bool | None = None,
 ) -> Iterable[tuple[str, int, BatchedTensorInputs]]:
     """Group consecutive `MultiModalKwargsItem`s from `mm_kwargs` with the same
     modality together into the same `MultiModalKwargs` instance.
@@ -452,8 +452,8 @@ def group_mm_kwargs_by_modality(
 
 def fetch_audio(
     audio_url: str,
-    audio_io_kwargs: Optional[dict[str, Any]] = None,
-) -> tuple[np.ndarray, Union[int, float]]:
+    audio_io_kwargs: dict[str, Any] | None = None,
+) -> tuple[np.ndarray, int | float]:
     """
     Args:
         audio_url: URL of the audio file to fetch.
@@ -466,7 +466,7 @@ def fetch_audio(
 
 def fetch_image(
     image_url: str,
-    image_io_kwargs: Optional[dict[str, Any]] = None,
+    image_io_kwargs: dict[str, Any] | None = None,
 ) -> Image.Image:
     """
     Args:
@@ -480,7 +480,7 @@ def fetch_image(
 
 def fetch_video(
     video_url: str,
-    video_io_kwargs: Optional[dict[str, Any]] = None,
+    video_io_kwargs: dict[str, Any] | None = None,
 ) -> tuple[npt.NDArray, dict[str, Any]]:
     """
     Args:
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 400d6a6be9bee..3f9c0460ba08e 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -6,7 +6,7 @@ from abc import abstractmethod
 from functools import partial
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Union
+from typing import Any
 
 import numpy as np
 import numpy.typing as npt
@@ -192,7 +192,7 @@ class OpenCVDynamicVideoBackend(OpenCVVideoBackend):
 
         # Refer to:
         # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140
-        frame_indices: Union[range, list[int]]
+        frame_indices: range | list[int]
         if duration <= max_duration:
             n = int(math.floor(duration * fps))
             frame_indices = sorted(
diff --git a/vllm/outputs.py b/vllm/outputs.py
index dc183bd8dbe93..114c1c5dc4b03 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -4,7 +4,7 @@
 from collections.abc import MutableSequence
 from collections.abc import Sequence as GenericSequence
 from dataclasses import dataclass
-from typing import Any, Generic, Optional, Union
+from typing import Any, Generic
 
 import torch
 from typing_extensions import TypeVar
@@ -41,11 +41,11 @@ class CompletionOutput:
     index: int
     text: str
     token_ids: GenericSequence[int]
-    cumulative_logprob: Optional[float]
-    logprobs: Optional[SampleLogprobs]
-    finish_reason: Optional[str] = None
-    stop_reason: Union[int, str, None] = None
-    lora_request: Optional[LoRARequest] = None
+    cumulative_logprob: float | None
+    logprobs: SampleLogprobs | None
+    finish_reason: str | None = None
+    stop_reason: int | str | None = None
+    lora_request: LoRARequest | None = None
 
     def finished(self) -> bool:
         return self.finish_reason is not None
@@ -108,19 +108,19 @@ class RequestOutput:
     def __init__(
         self,
         request_id: str,
-        prompt: Optional[str],
-        prompt_token_ids: Optional[list[int]],
-        prompt_logprobs: Optional[PromptLogprobs],
+        prompt: str | None,
+        prompt_token_ids: list[int] | None,
+        prompt_logprobs: PromptLogprobs | None,
         outputs: list[CompletionOutput],
         finished: bool,
-        metrics: Optional[Union[RequestMetrics, RequestStateStats]] = None,
-        lora_request: Optional[LoRARequest] = None,
-        encoder_prompt: Optional[str] = None,
-        encoder_prompt_token_ids: Optional[list[int]] = None,
-        num_cached_tokens: Optional[int] = None,
+        metrics: RequestMetrics | RequestStateStats | None = None,
+        lora_request: LoRARequest | None = None,
+        encoder_prompt: str | None = None,
+        encoder_prompt_token_ids: list[int] | None = None,
+        num_cached_tokens: int | None = None,
         *,
-        multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None,
-        kv_transfer_params: Optional[dict[str, Any]] = None,
+        multi_modal_placeholders: MultiModalPlaceholderDict | None = None,
+        kv_transfer_params: dict[str, Any] | None = None,
         # Forward compatibility, code that uses args added in new release can
         # still run with older versions of vLLM without breaking.
         **kwargs: Any,
diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index d1708ad5c7517..d63ef78f5b2d2 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -3,7 +3,7 @@
 import logging
 import traceback
 from itertools import chain
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 from vllm import envs
 from vllm.plugins import load_plugins_by_group
@@ -31,7 +31,7 @@ def vllm_version_matches_substr(substr: str) -> bool:
     return substr in vllm_version
 
 
-def tpu_platform_plugin() -> Optional[str]:
+def tpu_platform_plugin() -> str | None:
     logger.debug("Checking if TPU platform is available.")
 
     # Check for Pathways TPU proxy
@@ -55,7 +55,7 @@ def tpu_platform_plugin() -> Optional[str]:
         return None
 
 
-def cuda_platform_plugin() -> Optional[str]:
+def cuda_platform_plugin() -> str | None:
     is_cuda = False
     logger.debug("Checking if CUDA platform is available.")
     try:
@@ -106,7 +106,7 @@ def cuda_platform_plugin() -> Optional[str]:
     return "vllm.platforms.cuda.CudaPlatform" if is_cuda else None
 
 
-def rocm_platform_plugin() -> Optional[str]:
+def rocm_platform_plugin() -> str | None:
     is_rocm = False
     logger.debug("Checking if ROCm platform is available.")
     try:
@@ -127,7 +127,7 @@ def rocm_platform_plugin() -> Optional[str]:
     return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None
 
 
-def xpu_platform_plugin() -> Optional[str]:
+def xpu_platform_plugin() -> str | None:
     is_xpu = False
     logger.debug("Checking if XPU platform is available.")
     try:
@@ -154,7 +154,7 @@ def xpu_platform_plugin() -> Optional[str]:
     return "vllm.platforms.xpu.XPUPlatform" if is_xpu else None
 
 
-def cpu_platform_plugin() -> Optional[str]:
+def cpu_platform_plugin() -> str | None:
     is_cpu = False
     logger.debug("Checking if CPU platform is available.")
     try:
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 49c953fd36ee0..ed6724b298a53 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -4,13 +4,13 @@
 import json
 import os
 import platform
-import re
 import subprocess
 import sys
 from dataclasses import dataclass
 from importlib.util import find_spec
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
+import regex as re
 import torch
 
 from vllm.logger import init_logger
@@ -128,7 +128,7 @@ class CpuPlatform(Platform):
         selected_backend: "_Backend",
         head_size: int,
         dtype: torch.dtype,
-        kv_cache_dtype: Optional[str],
+        kv_cache_dtype: str | None,
         block_size: int,
         use_v1: bool,
         use_mla: bool,
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index e0f832b431147..b51421b6a32d3 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -5,9 +5,10 @@ pynvml. However, it should not initialize cuda context.
 """
 
 import os
+from collections.abc import Callable
 from datetime import timedelta
 from functools import cache, wraps
-from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
+from typing import TYPE_CHECKING, TypeVar
 
 import torch
 from torch.distributed import PrefixStore, ProcessGroup
@@ -85,7 +86,7 @@ class CudaPlatformBase(Platform):
         _ = torch.zeros(1, device=device)
 
     @classmethod
-    def get_device_capability(cls, device_id: int = 0) -> Optional[DeviceCapability]:
+    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability | None:
         raise NotImplementedError
 
     @classmethod
@@ -210,7 +211,7 @@ class CudaPlatformBase(Platform):
 
     @classmethod
     def get_current_memory_usage(
-        cls, device: Optional[torch.types.Device] = None
+        cls, device: torch.types.Device | None = None
     ) -> float:
         torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats(device)
@@ -594,7 +595,7 @@ class NvmlCudaPlatform(CudaPlatformBase):
     @classmethod
     @cache
     @with_nvml_context
-    def get_device_capability(cls, device_id: int = 0) -> Optional[DeviceCapability]:
+    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability | None:
         try:
             physical_device_id = cls.device_id_to_physical_device_id(device_id)
             handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
@@ -607,7 +608,7 @@ class NvmlCudaPlatform(CudaPlatformBase):
     @with_nvml_context
     def has_device_capability(
         cls,
-        capability: Union[tuple[int, int], int],
+        capability: tuple[int, int] | int,
         device_id: int = 0,
     ) -> bool:
         try:
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index e372ebf0cb3f7..9b8d75ac22fe0 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -8,7 +8,7 @@ import random
 import sys
 from datetime import timedelta
 from platform import uname
-from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union
+from typing import TYPE_CHECKING, Any, NamedTuple
 
 import numpy as np
 import torch
@@ -20,18 +20,16 @@ from vllm.logger import init_logger
 if TYPE_CHECKING:
     from vllm.attention.backends.registry import _Backend
     from vllm.config import ModelConfig, VllmConfig
-    from vllm.lora.request import LoRARequest
     from vllm.pooling_params import PoolingParams
     from vllm.sampling_params import SamplingParams
     from vllm.utils import FlexibleArgumentParser
 else:
-    _Backend = None
-    ModelConfig = None
-    VllmConfig = None
-    LoRARequest = None
-    PoolingParams = None
-    SamplingParams = None
-    FlexibleArgumentParser = None
+    _Backend = object
+    ModelConfig = object
+    VllmConfig = object
+    PoolingParams = object
+    SamplingParams = object
+    FlexibleArgumentParser = object
 
 logger = init_logger(__name__)
 
@@ -113,7 +111,7 @@ class Platform:
 
     additional_env_vars: list[str] = []
 
-    _global_graph_pool: Optional[Any] = None
+    _global_graph_pool: Any | None = None
 
     @property
     def supported_dtypes(self) -> list[torch.dtype]:
@@ -180,7 +178,7 @@ class Platform:
             import vllm._moe_C  # noqa: F401
 
     @classmethod
-    def get_vit_attn_backend(cls, head_size: int, dtype: torch.dtype) -> "_Backend":
+    def get_vit_attn_backend(cls, head_size: int, dtype: torch.dtype) -> _Backend:
         from vllm.attention.backends.registry import _Backend
 
         return _Backend.TORCH_SDPA
@@ -188,10 +186,10 @@ class Platform:
     @classmethod
     def get_attn_backend_cls(
         cls,
-        selected_backend: "_Backend",
+        selected_backend: _Backend,
         head_size: int,
         dtype: torch.dtype,
-        kv_cache_dtype: Optional[str],
+        kv_cache_dtype: str | None,
         block_size: int,
         use_v1: bool,
         use_mla: bool,
@@ -205,14 +203,14 @@ class Platform:
     def get_device_capability(
         cls,
         device_id: int = 0,
-    ) -> Optional[DeviceCapability]:
+    ) -> DeviceCapability | None:
         """Stateless version of [torch.cuda.get_device_capability][]."""
         return None
 
     @classmethod
     def has_device_capability(
         cls,
-        capability: Union[tuple[int, int], int],
+        capability: tuple[int, int] | int,
         device_id: int = 0,
     ) -> bool:
         """
@@ -236,7 +234,7 @@ class Platform:
     @classmethod
     def is_device_capability(
         cls,
-        capability: Union[tuple[int, int], int],
+        capability: tuple[int, int] | int,
         device_id: int = 0,
     ) -> bool:
         """
@@ -283,7 +281,7 @@ class Platform:
         return torch.inference_mode(mode=True)
 
     @classmethod
-    def seed_everything(cls, seed: Optional[int] = None) -> None:
+    def seed_everything(cls, seed: int | None = None) -> None:
         """
         Set the seed of each random module.
         `torch.manual_seed` will set seed on all devices.
@@ -304,7 +302,7 @@ class Platform:
 
     @classmethod
     def pre_register_and_update(
-        cls, parser: Optional[FlexibleArgumentParser] = None
+        cls, parser: FlexibleArgumentParser | None = None
     ) -> None:
         """
         Do some pre-registration or update action for the current platform.
@@ -389,7 +387,7 @@ class Platform:
 
     @classmethod
     def get_current_memory_usage(
-        cls, device: Optional[torch.types.Device] = None
+        cls, device: torch.types.Device | None = None
     ) -> float:
         """
         Return the memory usage in bytes.
@@ -501,7 +499,7 @@ class Platform:
     def validate_request(
         cls,
         prompt: PromptType,
-        params: Union[SamplingParams, PoolingParams],
+        params: SamplingParams | PoolingParams,
         processed_inputs: ProcessorInputs,
     ) -> None:
         """Raises if this request is unsupported on this platform"""
@@ -557,7 +555,7 @@ class Platform:
 
     @classmethod
     def is_kv_cache_dtype_supported(
-        cls, kv_cache_dtype: str, model_config: "ModelConfig"
+        cls, kv_cache_dtype: str, model_config: ModelConfig
     ) -> bool:
         """
         Returns if the kv_cache_dtype is supported by the current platform.
@@ -617,7 +615,7 @@ class Platform:
         return {}
 
     @classmethod
-    def get_nixl_memory_type(cls) -> Optional[str]:
+    def get_nixl_memory_type(cls) -> str | None:
         """
         Returns the nixl memory type for the current platform.
         """
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 95d3fa74e325d..81745257d0ae2 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -4,7 +4,7 @@
 import os
 from datetime import timedelta
 from functools import cache, lru_cache, wraps
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 import torch
 from torch.distributed import PrefixStore, ProcessGroup
@@ -140,8 +140,8 @@ def use_rocm_custom_paged_attention(
     max_seq_len: int,
     sliding_window: int,
     kv_cache_dtype: str,
-    alibi_slopes: Optional[torch.Tensor] = None,
-    sinks: Optional[torch.Tensor] = None,
+    alibi_slopes: torch.Tensor | None = None,
+    sinks: torch.Tensor | None = None,
 ) -> bool:
     GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
     ON_GFX9 = any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942", "gfx950"])
@@ -320,7 +320,7 @@ class RocmPlatform(Platform):
 
     @classmethod
     @lru_cache(maxsize=8)
-    def get_device_capability(cls, device_id: int = 0) -> Optional[DeviceCapability]:
+    def get_device_capability(cls, device_id: int = 0) -> DeviceCapability | None:
         major, minor = torch.cuda.get_device_capability(device_id)
         return DeviceCapability(major=major, minor=minor)
 
@@ -420,7 +420,7 @@ class RocmPlatform(Platform):
 
     @classmethod
     def get_current_memory_usage(
-        cls, device: Optional[torch.types.Device] = None
+        cls, device: torch.types.Device | None = None
     ) -> float:
         torch.cuda.reset_peak_memory_stats(device)
         return torch.cuda.mem_get_info(device)[1] - torch.cuda.mem_get_info(device)[0]
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index 1c323ba8200a2..dcd595cf9082f 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import contextlib
-from typing import TYPE_CHECKING, Optional, Union, cast
+from typing import TYPE_CHECKING, cast
 
 import torch
 from tpu_info import device
@@ -57,7 +57,7 @@ class TpuPlatform(Platform):
         selected_backend: "_Backend",
         head_size: int,
         dtype: torch.dtype,
-        kv_cache_dtype: Optional[str],
+        kv_cache_dtype: str | None,
         block_size: int,
         use_v1: bool,
         use_mla: bool,
@@ -211,7 +211,7 @@ class TpuPlatform(Platform):
     def validate_request(
         cls,
         prompt: PromptType,
-        params: Union[SamplingParams, PoolingParams],
+        params: SamplingParams | PoolingParams,
         processed_inputs: ProcessorInputs,
     ) -> None:
         """Raises if this request is unsupported on this platform"""
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index b75b52938839b..dcfc970d3a83d 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -3,7 +3,7 @@
 
 import contextlib
 import os
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 import torch
 
@@ -47,7 +47,7 @@ class XPUPlatform(Platform):
         selected_backend: "_Backend",
         head_size: int,
         dtype: torch.dtype,
-        kv_cache_dtype: Optional[str],
+        kv_cache_dtype: str | None,
         block_size: int,
         use_v1: bool,
         use_mla: bool,
@@ -113,7 +113,7 @@ class XPUPlatform(Platform):
     def get_device_capability(
         cls,
         device_id: int = 0,
-    ) -> Optional[DeviceCapability]:
+    ) -> DeviceCapability | None:
         # capacity format differs from cuda's and will cause unexpected
         # failure, so use None directly
         return None
@@ -213,7 +213,7 @@ class XPUPlatform(Platform):
 
     @classmethod
     def get_current_memory_usage(
-        cls, device: Optional[torch.types.Device] = None
+        cls, device: torch.types.Device | None = None
     ) -> float:
         torch.xpu.reset_peak_memory_stats(device)
         return torch.xpu.max_memory_allocated(device)
diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py
index 094bda3f9369e..fe04b759a12c8 100644
--- a/vllm/plugins/__init__.py
+++ b/vllm/plugins/__init__.py
@@ -2,7 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import logging
-from typing import Any, Callable
+from collections.abc import Callable
+from typing import Any
 
 import vllm.envs as envs
 
diff --git a/vllm/plugins/io_processors/__init__.py b/vllm/plugins/io_processors/__init__.py
index 7a914442c4ab8..cb58bfe75f1d7 100644
--- a/vllm/plugins/io_processors/__init__.py
+++ b/vllm/plugins/io_processors/__init__.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import logging
 
 from vllm.config import VllmConfig
diff --git a/vllm/plugins/io_processors/interface.py b/vllm/plugins/io_processors/interface.py
index 84af40d01c439..81e077d5bdacc 100644
--- a/vllm/plugins/io_processors/interface.py
+++ b/vllm/plugins/io_processors/interface.py
@@ -3,7 +3,7 @@
 
 from abc import ABC, abstractmethod
 from collections.abc import AsyncGenerator, Sequence
-from typing import Any, Generic, Optional, TypeVar, Union
+from typing import Any, Generic, TypeVar
 
 from vllm.config import VllmConfig
 from vllm.entrypoints.openai.protocol import IOProcessorResponse
@@ -22,24 +22,24 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
     def pre_process(
         self,
         prompt: IOProcessorInput,
-        request_id: Optional[str] = None,
+        request_id: str | None = None,
         **kwargs,
-    ) -> Union[PromptType, Sequence[PromptType]]:
+    ) -> PromptType | Sequence[PromptType]:
         raise NotImplementedError
 
     async def pre_process_async(
         self,
         prompt: IOProcessorInput,
-        request_id: Optional[str] = None,
+        request_id: str | None = None,
         **kwargs,
-    ) -> Union[PromptType, Sequence[PromptType]]:
+    ) -> PromptType | Sequence[PromptType]:
         return self.pre_process(prompt, request_id, **kwargs)
 
     @abstractmethod
     def post_process(
         self,
         model_output: Sequence[PoolingRequestOutput],
-        request_id: Optional[str] = None,
+        request_id: str | None = None,
         **kwargs,
     ) -> IOProcessorOutput:
         raise NotImplementedError
@@ -47,7 +47,7 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
     async def post_process_async(
         self,
         model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
-        request_id: Optional[str] = None,
+        request_id: str | None = None,
         **kwargs,
     ) -> IOProcessorOutput:
         # We cannot guarantee outputs are returned in the same order they were
diff --git a/vllm/plugins/lora_resolvers/filesystem_resolver.py b/vllm/plugins/lora_resolvers/filesystem_resolver.py
index c3255af457026..8d94a673e8623 100644
--- a/vllm/plugins/lora_resolvers/filesystem_resolver.py
+++ b/vllm/plugins/lora_resolvers/filesystem_resolver.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
 import os
-from typing import Optional
 
 import vllm.envs as envs
 from vllm.lora.request import LoRARequest
@@ -15,7 +14,7 @@ class FilesystemResolver(LoRAResolver):
 
     async def resolve_lora(
         self, base_model_name: str, lora_name: str
-    ) -> Optional[LoRARequest]:
+    ) -> LoRARequest | None:
         lora_path = os.path.join(self.lora_cache_dir, lora_name)
         if os.path.exists(lora_path):
             adapter_config_path = os.path.join(
diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py
index f7a53503e5841..175a4ac01b83e 100644
--- a/vllm/pooling_params.py
+++ b/vllm/pooling_params.py
@@ -34,32 +34,32 @@ class PoolingParams(
     """
 
     # --8<-- [start:common-pooling-params]
-    truncate_prompt_tokens: Optional[Annotated[int, msgspec.Meta(ge=-1)]] = None
+    truncate_prompt_tokens: Annotated[int, msgspec.Meta(ge=-1)] | None = None
     # --8<-- [end:common-pooling-params]
 
     ## for embeddings models
     # --8<-- [start:embedding-pooling-params]
-    dimensions: Optional[int] = None
-    normalize: Optional[bool] = None
+    dimensions: int | None = None
+    normalize: bool | None = None
     # --8<-- [end:embedding-pooling-params]
 
     ## for classification, scoring and rerank
     # --8<-- [start:classification-pooling-params]
-    activation: Optional[bool] = None
+    activation: bool | None = None
     # --8<-- [end:classification-pooling-params]
 
     ## for reward models
-    softmax: Optional[bool] = None
-    step_tag_id: Optional[int] = None
-    returned_token_ids: Optional[list[int]] = None
+    softmax: bool | None = None
+    step_tag_id: int | None = None
+    returned_token_ids: list[int] | None = None
 
-    task: Optional[PoolingTask] = None
+    task: PoolingTask | None = None
     """Internal use only."""
 
     requires_token_ids: bool = False
     """Internal use only."""
 
-    extra_kwargs: Optional[dict[str, Any]] = None
+    extra_kwargs: dict[str, Any] | None = None
     """Internal use only."""
 
     output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY
diff --git a/vllm/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py
index fea299b287f98..1c0fce702b3fa 100644
--- a/vllm/profiler/layerwise_profile.py
+++ b/vllm/profiler/layerwise_profile.py
@@ -3,8 +3,9 @@
 
 import copy
 from collections import defaultdict
+from collections.abc import Callable
 from dataclasses import asdict, dataclass, field
-from typing import Any, Callable, Optional, TypeAlias, Union
+from typing import Any, Optional, TypeAlias
 
 import pandas as pd
 from torch._C._autograd import DeviceType, _KinetoEvent, _ProfilerResult
@@ -62,14 +63,14 @@ class ModelStatsEntry:
     trace: str
 
 
-StatsEntry: TypeAlias = Union[ModelStatsEntry, SummaryStatsEntry]
+StatsEntry: TypeAlias = ModelStatsEntry | SummaryStatsEntry
 
 
 @dataclass
 class _StatsTreeNode:
     entry: StatsEntry
     children: list[StatsEntry]
-    parent: Optional[StatsEntry]
+    parent: StatsEntry | None
 
 
 @dataclass
@@ -82,7 +83,7 @@ class LayerwiseProfileResults(profile):
     _summary_stats_tree: list[_StatsTreeNode] = field(init=False)
 
     # profile metadata
-    num_running_seqs: Optional[int] = None
+    num_running_seqs: int | None = None
 
     def __post_init__(self):
         self._build_correlation_map()
@@ -150,7 +151,7 @@ class LayerwiseProfileResults(profile):
     @staticmethod
     def _indent_row_names_based_on_depth(
         depths_rows: list[tuple[int, StatsEntry]],
-        indent_style: Union[Callable[[int], str], str] = " ",
+        indent_style: Callable[[int], str] | str = " ",
     ):
         indented_rows = []
         for depth, row in depths_rows:
@@ -171,7 +172,7 @@ class LayerwiseProfileResults(profile):
         event_tree = self._kineto_results.experimental_event_tree()
 
         def _df_traversal(
-            event: _ProfilerEvent, curr_node: Optional[_ModuleTreeNode] = None
+            event: _ProfilerEvent, curr_node: _ModuleTreeNode | None = None
         ):
             # For the tensor parallel case for now only look at task 1
             if event.start_tid != 1:
@@ -242,7 +243,7 @@ class LayerwiseProfileResults(profile):
 
         def build_summary_stats_tree_df(
             node: _ModuleTreeNode,
-            parent: Optional[_StatsTreeNode] = None,
+            parent: _StatsTreeNode | None = None,
             summary_trace: tuple[str] = (),
         ):
             if event_has_module(node.event):
@@ -287,7 +288,7 @@ class LayerwiseProfileResults(profile):
             self._summary_stats_tree.append(build_summary_stats_tree_df(root))
 
         def build_model_stats_tree_df(
-            node: _ModuleTreeNode, parent: Optional[_StatsTreeNode] = None
+            node: _ModuleTreeNode, parent: _StatsTreeNode | None = None
         ):
             if event_has_module(
                 node.event,
@@ -357,7 +358,7 @@ class LayerwiseProfileResults(profile):
 
 
 class layerwise_profile(profile):
-    def __init__(self, num_running_seqs: Optional[int] = None):
+    def __init__(self, num_running_seqs: int | None = None):
         """
         layerwise profile constructor.
 
diff --git a/vllm/profiler/utils.py b/vllm/profiler/utils.py
index b3607fbecde78..c95f9f4ac9779 100644
--- a/vllm/profiler/utils.py
+++ b/vllm/profiler/utils.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
-from typing import Callable, Union
+from collections.abc import Callable
 
 from torch._C._profiler import _EventType, _ProfilerEvent, _TensorMetadata
 
@@ -78,7 +78,7 @@ class TablePrinter:
 
 
 def indent_string(
-    string: str, indent: int, indent_style: Union[Callable[[int], str], str] = " "
+    string: str, indent: int, indent_style: Callable[[int], str] | str = " "
 ) -> str:
     if indent:
         if isinstance(indent_style, str):
diff --git a/vllm/ray/ray_env.py b/vllm/ray/ray_env.py
index a89e55bd7e4b6..85623cfe5ff57 100644
--- a/vllm/ray/ray_env.py
+++ b/vllm/ray/ray_env.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import json
 import os
-from typing import Optional
 
 import vllm.envs as envs
 from vllm.logger import init_logger
@@ -32,9 +31,9 @@ except json.JSONDecodeError:
 
 
 def get_env_vars_to_copy(
-    exclude_vars: Optional[set[str]] = None,
-    additional_vars: Optional[set[str]] = None,
-    destination: Optional[str] = None,
+    exclude_vars: set[str] | None = None,
+    additional_vars: set[str] | None = None,
+    destination: str | None = None,
 ) -> set[str]:
     """
     Get the environment variables to copy to downstream Ray actors.
diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py
index 2d93f0702f721..b85216f43fadc 100644
--- a/vllm/reasoning/abs_reasoning_parsers.py
+++ b/vllm/reasoning/abs_reasoning_parsers.py
@@ -1,13 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import os
 from abc import abstractmethod
-from collections.abc import Sequence
+from collections.abc import Callable, Sequence
 from functools import cached_property
-from typing import TYPE_CHECKING, Any, Callable, Union
+from typing import TYPE_CHECKING, Any
 
 from vllm.logger import init_logger
 from vllm.utils import import_from_path, is_list_of
@@ -78,7 +76,7 @@ class ReasoningParser:
     def extract_reasoning_content(
         self,
         model_output: str,
-        request: Union[ChatCompletionRequest, ResponsesRequest],
+        request: ChatCompletionRequest | ResponsesRequest,
     ) -> tuple[str | None, str | None]:
         """
         Extract reasoning content from a complete model-generated string.
@@ -107,7 +105,7 @@ class ReasoningParser:
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """
         Instance method that should be implemented for extracting reasoning
         from an incomplete response; for use when handling reasoning calls and
@@ -136,7 +134,7 @@ class ReasoningParserManager:
     def _register_module(
         cls,
         module: type,
-        module_name: Union[str, list[str]] | None = None,
+        module_name: str | list[str] | None = None,
         force: bool = True,
     ) -> None:
         if not issubclass(module, ReasoningParser):
@@ -158,10 +156,10 @@ class ReasoningParserManager:
     @classmethod
     def register_module(
         cls,
-        name: Union[str, list[str]] | None = None,
+        name: str | list[str] | None = None,
         force: bool = True,
-        module: Union[type, None] = None,
-    ) -> Union[type, Callable]:
+        module: type | None = None,
+    ) -> type | Callable:
         """
         Register module with the given name or name list. it can be used as a
         decoder(with module as None) or normal function(with module as not
diff --git a/vllm/reasoning/basic_parsers.py b/vllm/reasoning/basic_parsers.py
index f47ffe6212caf..621a73b2a59f0 100644
--- a/vllm/reasoning/basic_parsers.py
+++ b/vllm/reasoning/basic_parsers.py
@@ -3,7 +3,6 @@
 
 from abc import abstractmethod
 from collections.abc import Sequence
-from typing import Optional, Union
 
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionRequest,
@@ -79,7 +78,7 @@ class BaseThinkingReasoningParser(ReasoningParser):
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """
         Extract reasoning content from a delta message.
         Handles streaming output where previous + delta = current.
@@ -135,8 +134,8 @@ class BaseThinkingReasoningParser(ReasoningParser):
             return DeltaMessage(content=delta_text)
 
     def extract_reasoning_content(
-        self, model_output: str, request: Union[ChatCompletionRequest, ResponsesRequest]
-    ) -> tuple[Optional[str], Optional[str]]:
+        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
+    ) -> tuple[str | None, str | None]:
         """
         Extract reasoning content from the model output.
 
diff --git a/vllm/reasoning/deepseek_r1_reasoning_parser.py b/vllm/reasoning/deepseek_r1_reasoning_parser.py
index 264da54b48793..d5200145ea03e 100644
--- a/vllm/reasoning/deepseek_r1_reasoning_parser.py
+++ b/vllm/reasoning/deepseek_r1_reasoning_parser.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
-from typing import Union
 
 from vllm.entrypoints.openai.protocol import DeltaMessage
 from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
@@ -36,7 +35,7 @@ class DeepSeekR1ReasoningParser(BaseThinkingReasoningParser):
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         ret = super().extract_reasoning_content_streaming(
             previous_text,
             current_text,
diff --git a/vllm/reasoning/glm4_moe_reasoning_parser.py b/vllm/reasoning/glm4_moe_reasoning_parser.py
index da98515c7e629..09cd43c1d555e 100644
--- a/vllm/reasoning/glm4_moe_reasoning_parser.py
+++ b/vllm/reasoning/glm4_moe_reasoning_parser.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
-from typing import Optional, Union
 
 from transformers import PreTrainedTokenizerBase
 
@@ -80,7 +79,7 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """
         Extract reasoning content from a delta message.
         Handles streaming output where previous + delta = current.
@@ -137,7 +136,7 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
 
     def extract_reasoning_content(
         self, model_output: str, request: ChatCompletionRequest
-    ) -> tuple[Optional[str], Optional[str]]:
+    ) -> tuple[str | None, str | None]:
         """
         Extract reasoning content from the model output.
 
diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py
index 738c7b51694a0..ccb2d9553c9f0 100644
--- a/vllm/reasoning/gptoss_reasoning_parser.py
+++ b/vllm/reasoning/gptoss_reasoning_parser.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
-from typing import Optional, Union
 
 from transformers import PreTrainedTokenizerBase
 
@@ -53,7 +52,7 @@ class GptOssReasoningParser(ReasoningParser):
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         prev_reasoning, prev_content, _ = parse_chat_output(list(previous_token_ids))
         cur_reasoning, cur_content, _ = parse_chat_output(list(current_token_ids))
         reasoning_delta = None
@@ -78,7 +77,7 @@ class GptOssReasoningParser(ReasoningParser):
         self,
         model_output: str,
         request: ChatCompletionRequest,
-    ) -> tuple[Optional[str], Optional[str]]:
+    ) -> tuple[str | None, str | None]:
         raise NotImplementedError(
             "gpt-oss has a special branch for parsing reasoning in non-streaming mode. This method shouldn't be used."  # noqa: E501
         )
diff --git a/vllm/reasoning/granite_reasoning_parser.py b/vllm/reasoning/granite_reasoning_parser.py
index 543b202989ee9..44391f8ad6351 100644
--- a/vllm/reasoning/granite_reasoning_parser.py
+++ b/vllm/reasoning/granite_reasoning_parser.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
-from typing import Optional, Union
 
 import regex as re
 from transformers import PreTrainedTokenizerBase
@@ -53,7 +52,7 @@ class GraniteReasoningParser(ReasoningParser):
 
     def extract_reasoning_content(
         self, model_output: str, request: ChatCompletionRequest
-    ) -> tuple[Optional[str], Optional[str]]:
+    ) -> tuple[str | None, str | None]:
         """Extract the reasoning content & content sections, respectively.
         If the sequence doesn't match what we expect, i.e., the model generates
         something else, all content is considered non-reasoning content.
@@ -82,7 +81,7 @@ class GraniteReasoningParser(ReasoningParser):
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """Extract the reasoning content / content emitted by granite models;
         If the sequence doesn't match what we expect, i.e., the model generates
         something else, all content is considered non-reasoning content.
@@ -322,7 +321,7 @@ class GraniteReasoningParser(ReasoningParser):
 
     def _get_content_sections(
         self, current_text: str
-    ) -> tuple[Optional[str], Optional[int], Optional[str]]:
+    ) -> tuple[str | None, int | None, str | None]:
         """Parse the text to extract the reasoning content / content
         if we have them.
 
diff --git a/vllm/reasoning/hunyuan_a13b_reasoning_parser.py b/vllm/reasoning/hunyuan_a13b_reasoning_parser.py
index 381f1b5f34667..e5cf6f399740f 100644
--- a/vllm/reasoning/hunyuan_a13b_reasoning_parser.py
+++ b/vllm/reasoning/hunyuan_a13b_reasoning_parser.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
-from typing import Optional, Union
 
 import regex as re
 from transformers import PreTrainedTokenizerBase
@@ -90,7 +89,7 @@ class HunyuanA13BReasoningParser(ReasoningParser):
 
     def extract_reasoning_content(
         self, model_output: str, request: ChatCompletionRequest
-    ) -> tuple[Optional[str], Optional[str]]:
+    ) -> tuple[str | None, str | None]:
         """Extract the reasoning content & content sections, respectively.
         If the sequence doesn't match what we expect, i.e., the model generates
         something else, all content is considered non-reasoning content.
@@ -150,7 +149,7 @@ class HunyuanA13BReasoningParser(ReasoningParser):
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """Extract content using token ID sequence state machine"""
         # Define sequences
         think_start_sequence = self.think_start_ids
diff --git a/vllm/reasoning/olmo3_reasoning_parser.py b/vllm/reasoning/olmo3_reasoning_parser.py
index b330e8b1fdd5b..b6c26899a1148 100644
--- a/vllm/reasoning/olmo3_reasoning_parser.py
+++ b/vllm/reasoning/olmo3_reasoning_parser.py
@@ -4,7 +4,7 @@
 import dataclasses as dt
 import enum
 from collections.abc import Sequence
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING
 
 import regex as re
 
@@ -36,7 +36,7 @@ class Indices:
         return self.end - self.start
 
 
-def string_overlap(a: str, b: str) -> tuple[Optional[Indices], Optional[Indices]]:
+def string_overlap(a: str, b: str) -> tuple[Indices | None, Indices | None]:
     """
     Find the longest overlap where the end of string a matches the start
     of string b.
@@ -90,7 +90,7 @@ class Olmo3ReasoningBuffer:
     # is when we switch to content state.
     state: Olmo3ReasoningState = Olmo3ReasoningState.REASONING
 
-    def process_buffer(self) -> Optional[DeltaMessage]:
+    def process_buffer(self) -> DeltaMessage | None:
         start_think_idx = self.buffer.find(self.think_start)
 
         if start_think_idx >= 0:
@@ -142,12 +142,12 @@ class Olmo3ReasoningBuffer:
         # is the length of the text buffer
         return len(self.buffer)
 
-    def add_text(self, delta_text: str) -> Optional[DeltaMessage]:
+    def add_text(self, delta_text: str) -> DeltaMessage | None:
         # we start by adding the delta text to the buffer
         self.buffer += delta_text
 
         # setting this to empty before starting
-        delta_message: Optional[DeltaMessage] = None
+        delta_message: DeltaMessage | None = None
 
         # we start by computing the overlap between the delta_text
         # and start/end of think tokens.
@@ -254,8 +254,8 @@ class Olmo3ReasoningParser(ReasoningParser):
     def extract_reasoning_content(
         self,
         model_output: str,
-        request: Union[ChatCompletionRequest, ResponsesRequest],
-    ) -> tuple[Optional[str], Optional[str]]:
+        request: ChatCompletionRequest | ResponsesRequest,
+    ) -> tuple[str | None, str | None]:
         """Extract the reasoning content & content sections, respectively.
         If the sequence doesn't match what we expect, i.e., the model generates
         something else, all content is considered non-reasoning content.
@@ -287,7 +287,7 @@ class Olmo3ReasoningParser(ReasoningParser):
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """Extract content using token ID sequence state machine"""
 
         delta_message = self.buffer.add_text(delta_text)
diff --git a/vllm/reasoning/qwen3_reasoning_parser.py b/vllm/reasoning/qwen3_reasoning_parser.py
index 160e8633a43fd..2ec06720719da 100644
--- a/vllm/reasoning/qwen3_reasoning_parser.py
+++ b/vllm/reasoning/qwen3_reasoning_parser.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
 
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ResponsesRequest
 from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
@@ -31,8 +30,8 @@ class Qwen3ReasoningParser(BaseThinkingReasoningParser):
         return "</think>"
 
     def extract_reasoning_content(
-        self, model_output: str, request: Union[ChatCompletionRequest, ResponsesRequest]
-    ) -> tuple[Optional[str], Optional[str]]:
+        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
+    ) -> tuple[str | None, str | None]:
         """
         Extract reasoning content from the model output.
 
diff --git a/vllm/reasoning/step3_reasoning_parser.py b/vllm/reasoning/step3_reasoning_parser.py
index c9f580077b338..ae066d96f2505 100644
--- a/vllm/reasoning/step3_reasoning_parser.py
+++ b/vllm/reasoning/step3_reasoning_parser.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Sequence
-from typing import Optional, Union
 
 import regex as re
 from transformers import PreTrainedTokenizerBase
@@ -50,7 +49,7 @@ class Step3ReasoningParser(ReasoningParser):
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """
         Extract reasoning content from a delta message.
         Handles streaming output where previous + delta = current.
@@ -81,7 +80,7 @@ class Step3ReasoningParser(ReasoningParser):
 
     def extract_reasoning_content(
         self, model_output: str, request: ChatCompletionRequest
-    ) -> tuple[Optional[str], Optional[str]]:
+    ) -> tuple[str | None, str | None]:
         # Check if the model output contains the </think> token
         if self.think_end_token not in model_output:
             # If no </think> token, everything is reasoning content
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index a1ff4e5ff63b2..76b89634f508c 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -7,7 +7,7 @@ import warnings
 from dataclasses import field
 from enum import Enum, IntEnum
 from functools import cached_property
-from typing import Annotated, Any, Optional, Union
+from typing import Annotated, Any
 
 import msgspec
 from pydantic.dataclasses import dataclass
@@ -32,19 +32,19 @@ class SamplingType(IntEnum):
 @dataclass
 class StructuredOutputsParams:
     # One of these fields will be used to build a logit processor.
-    json: Optional[Union[str, dict]] = None
-    regex: Optional[str] = None
-    choice: Optional[list[str]] = None
-    grammar: Optional[str] = None
-    json_object: Optional[bool] = None
+    json: str | dict | None = None
+    regex: str | None = None
+    choice: list[str] | None = None
+    grammar: str | None = None
+    json_object: bool | None = None
     # These are other options that can be set.
     disable_fallback: bool = False
     disable_any_whitespace: bool = False
     disable_additional_properties: bool = False
-    whitespace_pattern: Optional[str] = None
-    structural_tag: Optional[str] = None
+    whitespace_pattern: str | None = None
+    structural_tag: str | None = None
 
-    _backend: Optional[str] = field(default=None, init=False)
+    _backend: str | None = field(default=None, init=False)
     """CAUTION: Should only be set by Processor._validate_structured_output"""
     _backend_was_auto: bool = field(default=False, init=False)
     """CAUTION: Should only be set by Processor._validate_structured_output"""
@@ -110,12 +110,12 @@ class SamplingParams(
         are generated and streamed cumulatively per request. To see all `n`
         outputs upon completion, use `output_kind=RequestOutputKind.FINAL_ONLY`
         in `SamplingParams`."""
-    best_of: Optional[int] = None
+    best_of: int | None = None
     """Number of output sequences that are generated from the prompt. From
     these `best_of` sequences, the top `n` sequences are returned. `best_of`
     must be greater than or equal to `n`. By default, `best_of` is set to `n`.
     Warning, this is only supported in V0."""
-    _real_n: Optional[int] = None
+    _real_n: int | None = None
     presence_penalty: float = 0.0
     """Penalizes new tokens based on whether they appear in the generated text
     so far. Values > 0 encourage the model to use new tokens, while values < 0
@@ -142,24 +142,24 @@ class SamplingParams(
     """Represents the minimum probability for a token to be considered,
     relative to the probability of the most likely token. Must be in [0, 1].
     Set to 0 to disable this."""
-    seed: Optional[int] = None
+    seed: int | None = None
     """Random seed to use for the generation."""
-    stop: Optional[Union[str, list[str]]] = None
+    stop: str | list[str] | None = None
     """String(s) that stop the generation when they are generated. The returned
     output will not contain the stop strings."""
-    stop_token_ids: Optional[list[int]] = None
+    stop_token_ids: list[int] | None = None
     """Token IDs that stop the generation when they are generated. The returned
     output will contain the stop tokens unless the stop tokens are special
     tokens."""
     ignore_eos: bool = False
     """Whether to ignore the EOS token and continue generating
     tokens after the EOS token is generated."""
-    max_tokens: Optional[int] = 16
+    max_tokens: int | None = 16
     """Maximum number of tokens to generate per output sequence."""
     min_tokens: int = 0
     """Minimum number of tokens to generate per output sequence before EOS or
     `stop_token_ids` can be generated"""
-    logprobs: Optional[int] = None
+    logprobs: int | None = None
     """Number of log probabilities to return per output token. When set to
     `None`, no probability is returned. If set to a non-`None` value, the
     result includes the log probabilities of the specified number of most
@@ -167,7 +167,7 @@ class SamplingParams(
     follows the OpenAI API: The API will always return the log probability of
     the sampled token, so there may be up to `logprobs+1` elements in the
     response. When set to -1, return all `vocab_size` log probabilities."""
-    prompt_logprobs: Optional[int] = None
+    prompt_logprobs: int | None = None
     """Number of log probabilities to return per prompt token.
     When set to -1, return all `vocab_size` log probabilities."""
     # NOTE: This parameter is only exposed at the engine level for now.
@@ -179,14 +179,14 @@ class SamplingParams(
     """Whether to skip special tokens in the output."""
     spaces_between_special_tokens: bool = True
     """Whether to add spaces between special tokens in the output."""
-    # Optional[list[LogitsProcessor]] type. We use Any here because
-    # Optional[list[LogitsProcessor]] type is not supported by msgspec.
-    logits_processors: Optional[Any] = None
+    # `list[LogitsProcessor] | None` type. We use Any here because
+    # `list[LogitsProcessor] | None` type is not supported by msgspec.
+    logits_processors: Any | None = None
     """Functions that modify logits based on previously generated tokens, and
     optionally prompt tokens as a first argument."""
     include_stop_str_in_output: bool = False
     """Whether to include the stop strings in output text."""
-    truncate_prompt_tokens: Optional[Annotated[int, msgspec.Meta(ge=-1)]] = None
+    truncate_prompt_tokens: Annotated[int, msgspec.Meta(ge=-1)] | None = None
     """If set to -1, will use the truncation size supported by the model. If
     set to an integer k, will use only the last k tokens from the prompt
     (i.e., left truncation). If set to `None`, truncation is disabled."""
@@ -198,60 +198,60 @@ class SamplingParams(
     _all_stop_token_ids: set[int] = msgspec.field(default_factory=set)
 
     # Fields used to construct logits processors
-    structured_outputs: Optional[StructuredOutputsParams] = None
+    structured_outputs: StructuredOutputsParams | None = None
     """Parameters for configuring structured outputs."""
-    guided_decoding: Optional[GuidedDecodingParams] = None
+    guided_decoding: GuidedDecodingParams | None = None
     """Deprecated alias for structured_outputs."""
-    logit_bias: Optional[dict[int, float]] = None
+    logit_bias: dict[int, float] | None = None
     """If provided, the engine will construct a logits processor that applies
     these logit biases."""
-    allowed_token_ids: Optional[list[int]] = None
+    allowed_token_ids: list[int] | None = None
     """If provided, the engine will construct a logits processor which only
     retains scores for the given token ids."""
-    extra_args: Optional[dict[str, Any]] = None
+    extra_args: dict[str, Any] | None = None
     """Arbitrary additional args, that can be used by custom sampling
     implementations, plugins, etc. Not used by any in-tree sampling
     implementations."""
 
     # Fields used for bad words
-    bad_words: Optional[list[str]] = None
+    bad_words: list[str] | None = None
     """Words that are not allowed to be generated. More precisely, only the
     last token of a corresponding token sequence is not allowed when the next
     generated token can complete the sequence."""
-    _bad_words_token_ids: Optional[list[list[int]]] = None
+    _bad_words_token_ids: list[list[int]] | None = None
 
     @staticmethod
     def from_optional(
-        n: Optional[int] = 1,
-        best_of: Optional[int] = None,
-        presence_penalty: Optional[float] = 0.0,
-        frequency_penalty: Optional[float] = 0.0,
-        repetition_penalty: Optional[float] = 1.0,
-        temperature: Optional[float] = 1.0,
-        top_p: Optional[float] = 1.0,
+        n: int | None = 1,
+        best_of: int | None = None,
+        presence_penalty: float | None = 0.0,
+        frequency_penalty: float | None = 0.0,
+        repetition_penalty: float | None = 1.0,
+        temperature: float | None = 1.0,
+        top_p: float | None = 1.0,
         top_k: int = 0,
         min_p: float = 0.0,
-        seed: Optional[int] = None,
-        stop: Optional[Union[str, list[str]]] = None,
-        stop_token_ids: Optional[list[int]] = None,
-        bad_words: Optional[list[str]] = None,
+        seed: int | None = None,
+        stop: str | list[str] | None = None,
+        stop_token_ids: list[int] | None = None,
+        bad_words: list[str] | None = None,
         include_stop_str_in_output: bool = False,
         ignore_eos: bool = False,
-        max_tokens: Optional[int] = 16,
+        max_tokens: int | None = 16,
         min_tokens: int = 0,
-        logprobs: Optional[int] = None,
-        prompt_logprobs: Optional[int] = None,
+        logprobs: int | None = None,
+        prompt_logprobs: int | None = None,
         detokenize: bool = True,
         skip_special_tokens: bool = True,
         spaces_between_special_tokens: bool = True,
-        logits_processors: Optional[list[LogitsProcessor]] = None,
-        truncate_prompt_tokens: Optional[Annotated[int, msgspec.Meta(ge=-1)]] = None,
+        logits_processors: list[LogitsProcessor] | None = None,
+        truncate_prompt_tokens: Annotated[int, msgspec.Meta(ge=-1)] | None = None,
         output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE,
-        structured_outputs: Optional[StructuredOutputsParams] = None,
-        guided_decoding: Optional[GuidedDecodingParams] = None,
-        logit_bias: Optional[Union[dict[int, float], dict[str, float]]] = None,
-        allowed_token_ids: Optional[list[int]] = None,
-        extra_args: Optional[dict[str, Any]] = None,
+        structured_outputs: StructuredOutputsParams | None = None,
+        guided_decoding: GuidedDecodingParams | None = None,
+        logit_bias: dict[int, float] | dict[str, float] | None = None,
+        allowed_token_ids: list[int] | None = None,
+        extra_args: dict[str, Any] | None = None,
     ) -> "SamplingParams":
         if logit_bias is not None:
             # Convert token_id to integer
@@ -483,7 +483,7 @@ class SamplingParams(
     def update_from_generation_config(
         self,
         generation_config: dict[str, Any],
-        model_eos_token_id: Optional[int] = None,
+        model_eos_token_id: int | None = None,
     ) -> None:
         """Update if there are non-default values from generation_config"""
 
@@ -559,7 +559,7 @@ class SamplingParams(
         return self._all_stop_token_ids
 
     @property
-    def bad_words_token_ids(self) -> Optional[list[list[int]]]:
+    def bad_words_token_ids(self) -> list[list[int]] | None:
         # For internal use only. Backward compatibility not guaranteed
         return self._bad_words_token_ids
 
diff --git a/vllm/scalar_type.py b/vllm/scalar_type.py
index fd25d198bf1ab..05760f3f82998 100644
--- a/vllm/scalar_type.py
+++ b/vllm/scalar_type.py
@@ -5,7 +5,6 @@ import functools
 import struct
 from dataclasses import dataclass
 from enum import Enum
-from typing import Optional, Union
 
 _SCALAR_TYPES_ID_MAP = {}
 
@@ -105,7 +104,7 @@ class ScalarType:
         double_raw = self._floating_point_max_int()
         return struct.unpack("!d", struct.pack("!Q", double_raw))[0]
 
-    def _raw_max(self) -> Union[int, float]:
+    def _raw_max(self) -> int | float:
         if self.is_floating_point():
             return self._floating_point_max()
         else:
@@ -114,7 +113,7 @@ class ScalarType:
             )
             return (1 << self.mantissa) - 1
 
-    def _raw_min(self) -> Union[int, float]:
+    def _raw_min(self) -> int | float:
         if self.is_floating_point():
             assert self.is_signed(), (
                 "We currently assume all floating point types are signed"
@@ -168,14 +167,14 @@ class ScalarType:
     def size_bits(self) -> int:
         return self.exponent + self.mantissa + int(self.signed)
 
-    def min(self) -> Union[int, float]:
+    def min(self) -> int | float:
         """
         Min representable value for this scalar type.
         (accounting for bias if there is one)
         """
         return self._raw_min() - self.bias
 
-    def max(self) -> Union[int, float]:
+    def max(self) -> int | float:
         """
         Max representable value for this scalar type.
         (accounting for bias if there is one)
@@ -265,14 +264,14 @@ class ScalarType:
     #
 
     @classmethod
-    def int_(cls, size_bits: int, bias: Optional[int]) -> "ScalarType":
+    def int_(cls, size_bits: int, bias: int | None) -> "ScalarType":
         "Create a signed integer scalar type (size_bits includes sign-bit)."
         ret = cls(0, size_bits - 1, True, bias if bias else 0)
         ret.id  # noqa B018: make sure the id is cached
         return ret
 
     @classmethod
-    def uint(cls, size_bits: int, bias: Optional[int]) -> "ScalarType":
+    def uint(cls, size_bits: int, bias: int | None) -> "ScalarType":
         """Create an unsigned integer scalar type."""
         ret = cls(0, size_bits, False, bias if bias else 0)
         ret.id  # noqa B018: make sure the id is cached
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 7682b7f58305e..afa4e20e4502a 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -3,7 +3,7 @@
 """Sequence and its related classes."""
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any
 
 import msgspec
 import torch
@@ -39,13 +39,13 @@ class RequestMetrics:
 
     arrival_time: float
     last_token_time: float
-    first_scheduled_time: Optional[float]
-    first_token_time: Optional[float]
-    time_in_queue: Optional[float]
-    finished_time: Optional[float] = None
-    scheduler_time: Optional[float] = None
-    model_forward_time: Optional[float] = None
-    model_execute_time: Optional[float] = None
+    first_scheduled_time: float | None
+    first_token_time: float | None
+    time_in_queue: float | None
+    finished_time: float | None = None
+    scheduler_time: float | None = None
+    model_forward_time: float | None = None
+    model_execute_time: float | None = None
 
 
 # cannot use msgspec.Struct here because Dynamo does not support it
@@ -59,7 +59,7 @@ class IntermediateTensors:
     """
 
     tensors: dict[str, torch.Tensor]
-    kv_connector_output: Optional[KVConnectorOutput]
+    kv_connector_output: KVConnectorOutput | None
 
     def __init__(self, tensors):
         # manually define this function, so that
@@ -68,7 +68,7 @@ class IntermediateTensors:
         # a string, and we will lose the information about the source file.
         self.tensors = tensors
 
-    def __getitem__(self, key: Union[str, slice]):
+    def __getitem__(self, key: str | slice):
         if isinstance(key, str):
             return self.tensors[key]
         elif isinstance(key, slice):
diff --git a/vllm/tracing.py b/vllm/tracing.py
index c9b595999fc78..7e3e883ca5f2d 100644
--- a/vllm/tracing.py
+++ b/vllm/tracing.py
@@ -3,7 +3,6 @@
 
 import os
 from collections.abc import Mapping
-from typing import Optional
 
 from vllm.logger import init_logger
 from vllm.utils import run_once
@@ -13,7 +12,7 @@ TRACE_HEADERS = ["traceparent", "tracestate"]
 logger = init_logger(__name__)
 
 _is_otel_imported = False
-otel_import_error_traceback: Optional[str] = None
+otel_import_error_traceback: str | None = None
 try:
     from opentelemetry.context.context import Context
     from opentelemetry.sdk.environment_variables import (
@@ -55,7 +54,7 @@ def is_otel_available() -> bool:
 
 def init_tracer(
     instrumenting_module_name: str, otlp_traces_endpoint: str
-) -> Optional[Tracer]:
+) -> Tracer | None:
     if not is_otel_available():
         raise ValueError(
             "OpenTelemetry is not available. Unable to initialize "
@@ -88,7 +87,7 @@ def get_span_exporter(endpoint):
     return OTLPSpanExporter(endpoint=endpoint)
 
 
-def extract_trace_context(headers: Optional[Mapping[str, str]]) -> Optional[Context]:
+def extract_trace_context(headers: Mapping[str, str] | None) -> Context | None:
     if is_otel_available():
         headers = headers or {}
         return TraceContextTextMapPropagator().extract(headers)
diff --git a/vllm/transformers_utils/chat_templates/registry.py b/vllm/transformers_utils/chat_templates/registry.py
index b8d0cd8d2f208..afeac2335dc77 100644
--- a/vllm/transformers_utils/chat_templates/registry.py
+++ b/vllm/transformers_utils/chat_templates/registry.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Callable
 from pathlib import Path
-from typing import Callable, Optional, Union
+from typing import TypeAlias
 
 from vllm.logger import init_logger
 
@@ -9,17 +10,17 @@ logger = init_logger(__file__)
 
 CHAT_TEMPLATES_DIR = Path(__file__).parent
 
-ChatTemplatePath = Union[Path, Callable[[str], Optional[Path]]]
+ChatTemplatePath: TypeAlias = Path | Callable[[str], Path | None]
 
 
-def _get_qwen_chat_template_fallback(tokenizer_name_or_path: str) -> Optional[Path]:
+def _get_qwen_chat_template_fallback(tokenizer_name_or_path: str) -> Path | None:
     if tokenizer_name_or_path.endswith("-Chat"):
         return CHAT_TEMPLATES_DIR / "template_chatml.jinja"
 
     return CHAT_TEMPLATES_DIR / "template_basic.jinja"
 
 
-def _get_minicpmv_chat_template_fallback(tokenizer_name_or_path: str) -> Optional[Path]:
+def _get_minicpmv_chat_template_fallback(tokenizer_name_or_path: str) -> Path | None:
     # MiniCPM-V-4.5 version uses a dedicated template
     if "4.5" in tokenizer_name_or_path or "4_5" in tokenizer_name_or_path:
         return CHAT_TEMPLATES_DIR / "template_minicpmv45.jinja"
@@ -58,7 +59,7 @@ def register_chat_template_fallback_path(
 def get_chat_template_fallback_path(
     model_type: str,
     tokenizer_name_or_path: str,
-) -> Optional[Path]:
+) -> Path | None:
     chat_template = _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK.get(model_type)
     if callable(chat_template):
         chat_template = chat_template(tokenizer_name_or_path)
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 4a8bb8f8b41de..58e0b53146026 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -4,10 +4,11 @@
 import json
 import os
 import time
+from collections.abc import Callable
 from dataclasses import asdict
 from functools import cache, partial
 from pathlib import Path
-from typing import Any, Callable, Literal, Optional, TypeVar, Union
+from typing import Any, Literal, TypeVar
 
 import huggingface_hub
 from huggingface_hub import (
@@ -47,7 +48,7 @@ MISTRAL_CONFIG_NAME = "params.json"
 logger = init_logger(__name__)
 
 
-def _get_hf_token() -> Optional[str]:
+def _get_hf_token() -> str | None:
     """
     Get the HuggingFace token from environment variable.
 
@@ -108,10 +109,10 @@ _AUTO_CONFIG_KWARGS_OVERRIDES: dict[str, dict[str, Any]] = {
 class HFConfigParser(ConfigParserBase):
     def parse(
         self,
-        model: Union[str, Path],
+        model: str | Path,
         trust_remote_code: bool,
-        revision: Optional[str] = None,
-        code_revision: Optional[str] = None,
+        revision: str | None = None,
+        code_revision: str | None = None,
         **kwargs,
     ) -> tuple[dict, PretrainedConfig]:
         kwargs["local_files_only"] = huggingface_hub.constants.HF_HUB_OFFLINE
@@ -173,10 +174,10 @@ class HFConfigParser(ConfigParserBase):
 class MistralConfigParser(ConfigParserBase):
     def parse(
         self,
-        model: Union[str, Path],
+        model: str | Path,
         trust_remote_code: bool,
-        revision: Optional[str] = None,
-        code_revision: Optional[str] = None,
+        revision: str | None = None,
+        code_revision: str | None = None,
         **kwargs,
     ) -> tuple[dict, PretrainedConfig]:
         # This function loads a params.json config which
@@ -247,8 +248,8 @@ def register_config_parser(config_format: str):
          ...         self,
          ...         model: Union[str, Path],
          ...         trust_remote_code: bool,
-         ...         revision: Optional[str] = None,
-         ...         code_revision: Optional[str] = None,
+         ...         revision: str | None = None,
+         ...         code_revision: str | None = None,
          ...         **kwargs,
          ...     ) -> tuple[dict, PretrainedConfig]:
          ...         raise NotImplementedError
@@ -310,9 +311,9 @@ def with_retry(
 def list_repo_files(
     repo_id: str,
     *,
-    revision: Optional[str] = None,
-    repo_type: Optional[str] = None,
-    token: Union[str, bool, None] = None,
+    revision: str | None = None,
+    repo_type: str | None = None,
+    token: str | bool | None = None,
 ) -> list[str]:
     def lookup_files() -> list[str]:
         # directly list files if model is local
@@ -348,9 +349,9 @@ def file_exists(
     repo_id: str,
     file_name: str,
     *,
-    repo_type: Optional[str] = None,
-    revision: Optional[str] = None,
-    token: Union[str, bool, None] = None,
+    repo_type: str | None = None,
+    revision: str | None = None,
+    token: str | bool | None = None,
 ) -> bool:
     file_list = list_repo_files(
         repo_id, repo_type=repo_type, revision=revision, token=token
@@ -360,7 +361,7 @@ def file_exists(
 
 # In offline mode the result can be a false negative
 def file_or_path_exists(
-    model: Union[str, Path], config_name: str, revision: Optional[str]
+    model: str | Path, config_name: str, revision: str | None
 ) -> bool:
     if (local_path := Path(model)).exists():
         return (local_path / config_name).is_file()
@@ -493,10 +494,10 @@ def maybe_override_with_speculators(
     model: str,
     tokenizer: str,
     trust_remote_code: bool,
-    revision: Optional[str] = None,
-    vllm_speculative_config: Optional[dict[str, Any]] = None,
+    revision: str | None = None,
+    vllm_speculative_config: dict[str, Any] | None = None,
     **kwargs,
-) -> tuple[str, str, Optional[dict[str, Any]]]:
+) -> tuple[str, str, dict[str, Any] | None]:
     """
     Resolve model configuration when speculators are detected.
 
@@ -551,13 +552,13 @@ def maybe_override_with_speculators(
 
 
 def get_config(
-    model: Union[str, Path],
+    model: str | Path,
     trust_remote_code: bool,
-    revision: Optional[str] = None,
-    code_revision: Optional[str] = None,
-    config_format: Union[str, ConfigFormat] = "auto",
-    hf_overrides_kw: Optional[dict[str, Any]] = None,
-    hf_overrides_fn: Optional[Callable[[PretrainedConfig], PretrainedConfig]] = None,
+    revision: str | None = None,
+    code_revision: str | None = None,
+    config_format: str | ConfigFormat = "auto",
+    hf_overrides_kw: dict[str, Any] | None = None,
+    hf_overrides_fn: Callable[[PretrainedConfig], PretrainedConfig] | None = None,
     **kwargs,
 ) -> PretrainedConfig:
     # Separate model folder from file path for GGUF models
@@ -669,8 +670,8 @@ def get_config(
 
 
 def try_get_local_file(
-    model: Union[str, Path], file_name: str, revision: Optional[str] = "main"
-) -> Optional[Path]:
+    model: str | Path, file_name: str, revision: str | None = "main"
+) -> Path | None:
     file_path = Path(model) / file_name
     if file_path.is_file():
         return file_path
@@ -687,7 +688,7 @@ def try_get_local_file(
 
 
 def get_hf_file_to_dict(
-    file_name: str, model: Union[str, Path], revision: Optional[str] = "main"
+    file_name: str, model: str | Path, revision: str | None = "main"
 ):
     """
     Downloads a file from the Hugging Face Hub and returns
@@ -735,7 +736,7 @@ def get_hf_file_to_dict(
 
 
 @cache
-def get_pooling_config(model: str, revision: Optional[str] = "main") -> Optional[dict]:
+def get_pooling_config(model: str, revision: str | None = "main") -> dict | None:
     """
     This function gets the pooling and normalize
     config from the model - only applies to
@@ -799,7 +800,7 @@ def get_pooling_config(model: str, revision: Optional[str] = "main") -> Optional
     return None
 
 
-def get_pooling_config_name(pooling_name: str) -> Union[str, None]:
+def get_pooling_config_name(pooling_name: str) -> str | None:
     if "pooling_mode_" in pooling_name:
         pooling_name = pooling_name.replace("pooling_mode_", "")
 
@@ -820,7 +821,7 @@ def get_pooling_config_name(pooling_name: str) -> Union[str, None]:
 
 @cache
 def get_sentence_transformer_tokenizer_config(
-    model: Union[str, Path], revision: Optional[str] = "main"
+    model: str | Path, revision: str | None = "main"
 ):
     """
     Returns the tokenization configuration dictionary for a
@@ -958,9 +959,9 @@ def maybe_register_config_serialize_by_value() -> None:
 
 
 def get_hf_image_processor_config(
-    model: Union[str, Path],
-    hf_token: Optional[Union[bool, str]] = None,
-    revision: Optional[str] = None,
+    model: str | Path,
+    hf_token: bool | str | None = None,
+    revision: str | None = None,
     **kwargs,
 ) -> dict[str, Any]:
     # ModelScope does not provide an interface for image_processor
@@ -992,9 +993,9 @@ def get_hf_text_config(config: PretrainedConfig):
 def try_get_generation_config(
     model: str,
     trust_remote_code: bool,
-    revision: Optional[str] = None,
-    config_format: Union[str, ConfigFormat] = "auto",
-) -> Optional[GenerationConfig]:
+    revision: str | None = None,
+    config_format: str | ConfigFormat = "auto",
+) -> GenerationConfig | None:
     try:
         return GenerationConfig.from_pretrained(
             model,
@@ -1016,7 +1017,7 @@ def try_get_generation_config(
 def try_get_safetensors_metadata(
     model: str,
     *,
-    revision: Optional[str] = None,
+    revision: str | None = None,
 ):
     get_safetensors_metadata_partial = partial(
         get_safetensors_metadata,
@@ -1034,10 +1035,10 @@ def try_get_safetensors_metadata(
 
 
 def try_get_tokenizer_config(
-    pretrained_model_name_or_path: Union[str, os.PathLike],
+    pretrained_model_name_or_path: str | os.PathLike,
     trust_remote_code: bool,
-    revision: Optional[str] = None,
-) -> Optional[dict[str, Any]]:
+    revision: str | None = None,
+) -> dict[str, Any] | None:
     try:
         return get_tokenizer_config(
             pretrained_model_name_or_path,
@@ -1051,7 +1052,7 @@ def try_get_tokenizer_config(
 def get_safetensors_params_metadata(
     model: str,
     *,
-    revision: Optional[str] = None,
+    revision: str | None = None,
 ) -> dict[str, Any]:
     """
     Get the safetensors metadata for remote model repository.
@@ -1112,7 +1113,7 @@ def _maybe_retrieve_max_pos_from_hf(model, revision, **kwargs) -> int:
     return max_position_embeddings
 
 
-def get_model_path(model: Union[str, Path], revision: Optional[str] = None):
+def get_model_path(model: str | Path, revision: str | None = None):
     if os.path.exists(model):
         return model
     assert huggingface_hub.constants.HF_HUB_OFFLINE
@@ -1132,8 +1133,8 @@ def get_model_path(model: Union[str, Path], revision: Optional[str] = None):
 
 
 def get_hf_file_bytes(
-    file_name: str, model: Union[str, Path], revision: Optional[str] = "main"
-) -> Optional[bytes]:
+    file_name: str, model: str | Path, revision: str | None = "main"
+) -> bytes | None:
     """Get file contents from HuggingFace repository as bytes."""
     file_path = try_get_local_file(model=model, file_name=file_name, revision=revision)
 
diff --git a/vllm/transformers_utils/config_parser_base.py b/vllm/transformers_utils/config_parser_base.py
index 0e1c49b428b07..79d47ff560420 100644
--- a/vllm/transformers_utils/config_parser_base.py
+++ b/vllm/transformers_utils/config_parser_base.py
@@ -3,7 +3,6 @@
 
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Optional, Union
 
 from transformers import PretrainedConfig
 
@@ -12,10 +11,10 @@ class ConfigParserBase(ABC):
     @abstractmethod
     def parse(
         self,
-        model: Union[str, Path],
+        model: str | Path,
         trust_remote_code: bool,
-        revision: Optional[str] = None,
-        code_revision: Optional[str] = None,
+        revision: str | None = None,
+        code_revision: str | None = None,
         **kwargs,
     ) -> tuple[dict, PretrainedConfig]:
         raise NotImplementedError
diff --git a/vllm/transformers_utils/configs/dotsocr.py b/vllm/transformers_utils/configs/dotsocr.py
index 446693b9a32eb..1e42cb2fd8594 100644
--- a/vllm/transformers_utils/configs/dotsocr.py
+++ b/vllm/transformers_utils/configs/dotsocr.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any, Optional
+from typing import Any
 
 from transformers.configuration_utils import PretrainedConfig
 from transformers.models.qwen2 import Qwen2Config
@@ -57,7 +57,7 @@ class DotsOCRConfig(Qwen2Config):
         self,
         image_token_id=151665,
         video_token_id=151656,
-        vision_config: Optional[dict] = None,
+        vision_config: dict | None = None,
         *args,
         **kwargs,
     ):
diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py
index 6e18513d12340..4da877f9e81f5 100644
--- a/vllm/transformers_utils/configs/eagle.py
+++ b/vllm/transformers_utils/configs/eagle.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
-from typing import Optional, Union
 
 from transformers import AutoConfig, PretrainedConfig
 
@@ -14,12 +13,12 @@ class EAGLEConfig(PretrainedConfig):
 
     def __init__(
         self,
-        model: Union[PretrainedConfig, dict, None] = None,
-        truncated_vocab_size: Optional[int] = None,
-        method: Optional[str] = "eagle",
+        model: PretrainedConfig | dict | None = None,
+        truncated_vocab_size: int | None = None,
+        method: str | None = "eagle",
         **kwargs,
     ):
-        model_config: Union[PretrainedConfig, DeepseekV2Config, None]
+        model_config: PretrainedConfig | DeepseekV2Config | None
         if isinstance(model, dict):
             archs = model.get("architectures", [])
             target_archs = ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]
@@ -84,7 +83,7 @@ class EAGLEConfig(PretrainedConfig):
     @classmethod
     def from_pretrained(
         cls,
-        pretrained_model_name_or_path: Union[str, os.PathLike],
+        pretrained_model_name_or_path: str | os.PathLike,
         **kwargs,
     ) -> "EAGLEConfig":
         config_dict, kwargs = cls.get_config_dict(
diff --git a/vllm/transformers_utils/configs/kimi_vl.py b/vllm/transformers_utils/configs/kimi_vl.py
index 89a8878465b6d..e8c19d0ec2ffe 100644
--- a/vllm/transformers_utils/configs/kimi_vl.py
+++ b/vllm/transformers_utils/configs/kimi_vl.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
-from typing import Optional, Union
 
 from transformers.configuration_utils import PretrainedConfig
 
@@ -14,8 +13,8 @@ class KimiVLConfig(PretrainedConfig):
 
     def __init__(
         self,
-        vision_config: Optional[Union[dict, MoonViTConfig]] = None,
-        text_config: Optional[Union[dict, DeepseekV2Config]] = None,
+        vision_config: dict | MoonViTConfig | None = None,
+        text_config: dict | DeepseekV2Config | None = None,
         ignore_index: int = -100,
         media_placeholder_token_id: int = 163605,
         pad_token_id: int = 0,
diff --git a/vllm/transformers_utils/configs/lfm2_moe.py b/vllm/transformers_utils/configs/lfm2_moe.py
index 7d17c2b4f74c5..37c038e12db80 100644
--- a/vllm/transformers_utils/configs/lfm2_moe.py
+++ b/vllm/transformers_utils/configs/lfm2_moe.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 from transformers.configuration_utils import PretrainedConfig
 
@@ -115,7 +114,7 @@ class Lfm2MoeConfig(PretrainedConfig):
         use_expert_bias: bool = True,
         routed_scaling_factor: float = 1.0,
         norm_topk_prob: bool = True,
-        layer_types: Optional[list[str]] = None,
+        layer_types: list[str] | None = None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
diff --git a/vllm/transformers_utils/configs/medusa.py b/vllm/transformers_utils/configs/medusa.py
index 7dcfd0cf26aef..bfa0f30e8961f 100644
--- a/vllm/transformers_utils/configs/medusa.py
+++ b/vllm/transformers_utils/configs/medusa.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
-from typing import Optional, Union
 
 from transformers import PretrainedConfig
 
@@ -18,7 +17,7 @@ class MedusaConfig(PretrainedConfig):
         num_hidden_layers: int = 1,
         max_paths: int = 64,
         topk: int = 10,
-        truncated_vocab_size: Optional[int] = None,
+        truncated_vocab_size: int | None = None,
         **kwargs,
     ):
         self.hidden_size = hidden_size
@@ -39,7 +38,7 @@ class MedusaConfig(PretrainedConfig):
     @classmethod
     def from_pretrained(
         cls,
-        pretrained_model_name_or_path: Union[str, os.PathLike],
+        pretrained_model_name_or_path: str | os.PathLike,
         **kwargs,
     ) -> "MedusaConfig":
         config_dict, kwargs = cls.get_config_dict(
diff --git a/vllm/transformers_utils/configs/midashenglm.py b/vllm/transformers_utils/configs/midashenglm.py
index 5c9e72be8ebff..e49bd26b2b00c 100644
--- a/vllm/transformers_utils/configs/midashenglm.py
+++ b/vllm/transformers_utils/configs/midashenglm.py
@@ -21,7 +21,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Optional, Union
 
 from transformers import PretrainedConfig
 from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import (
@@ -36,15 +35,15 @@ class DashengConfig(PretrainedConfig):
         self,
         embed_dim: int = 768,
         outputdim: int = 527,
-        patch_size: Union[int, tuple[int, int]] = 16,
-        patch_stride: Union[int, tuple[int, int]] = 16,
+        patch_size: int | tuple[int, int] = 16,
+        patch_stride: int | tuple[int, int] = 16,
         input_channels: int = 1,
         target_length: int = 1012,
         depth: int = 12,
         num_heads: int = 12,
         mlp_ratio: float = 4.0,
         qkv_bias: bool = True,
-        init_values: Optional[float] = None,
+        init_values: float | None = None,
         drop_rate: float = 0.0,
         attn_drop_rate: float = 0.0,
         f_min: float = 0.0,
@@ -86,10 +85,10 @@ class MiDashengLMConfig(PretrainedConfig):
 
     def __init__(
         self,
-        audio_encoder_config: Optional[dict] = None,
+        audio_encoder_config: dict | None = None,
         subsample_factor: int = 5,
-        text_config: Optional[dict] = None,
-        audio_token_id: Optional[int] = None,
+        text_config: dict | None = None,
+        audio_token_id: int | None = None,
         **kwargs,
     ):
         self.audio_encoder_config = DashengConfig(**(audio_encoder_config or {}))
diff --git a/vllm/transformers_utils/configs/mlp_speculator.py b/vllm/transformers_utils/configs/mlp_speculator.py
index 45d76a8fdf264..75745f227f482 100644
--- a/vllm/transformers_utils/configs/mlp_speculator.py
+++ b/vllm/transformers_utils/configs/mlp_speculator.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 from transformers import PretrainedConfig
 
@@ -19,7 +18,7 @@ class MLPSpeculatorConfig(PretrainedConfig):
         emb_dim: int = 4096,
         inner_dim: int = 0,
         n_predict: int = 3,
-        top_k_tokens_per_head: Optional[list[int]] = None,
+        top_k_tokens_per_head: list[int] | None = None,
         n_candidates: int = 5,
         tie_weights: bool = False,
         scale_input: bool = False,
diff --git a/vllm/transformers_utils/configs/ovis.py b/vllm/transformers_utils/configs/ovis.py
index 404fa700a26c0..294b4c9037aaf 100644
--- a/vllm/transformers_utils/configs/ovis.py
+++ b/vllm/transformers_utils/configs/ovis.py
@@ -5,7 +5,7 @@
 # adapted from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py
 # and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py
 # Ovis Config with AimV2 config registration removed for Transformers compatibility
-from typing import Any, Optional, Union
+from typing import Any
 
 from transformers import AutoConfig, PretrainedConfig
 
@@ -76,7 +76,7 @@ class BaseVisualTokenizerConfig(PretrainedConfig):
         tau=1.0,
         depths=None,
         drop_cls_token=False,
-        backbone_config: Optional[Union[PretrainedConfig, dict]] = None,
+        backbone_config: PretrainedConfig | dict | None = None,
         hidden_stride: int = 1,
         **kwargs,
     ):
@@ -142,8 +142,8 @@ class OvisConfig(PretrainedConfig):
 
     def __init__(
         self,
-        llm_config: Optional[Union[PretrainedConfig, dict]] = None,
-        visual_tokenizer_config: Optional[Union[PretrainedConfig, dict]] = None,
+        llm_config: PretrainedConfig | dict | None = None,
+        visual_tokenizer_config: PretrainedConfig | dict | None = None,
         multimodal_max_length=8192,
         hidden_size=None,
         conversation_formatter_class=None,
diff --git a/vllm/transformers_utils/configs/radio.py b/vllm/transformers_utils/configs/radio.py
index f13598034bae8..2b6544fb273c2 100644
--- a/vllm/transformers_utils/configs/radio.py
+++ b/vllm/transformers_utils/configs/radio.py
@@ -2,8 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Radio vision model configuration"""
 
-from typing import Optional, Union
-
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
 
@@ -60,9 +58,9 @@ class RadioConfig(PretrainedConfig):
         initializer_factor: float = 1.0,
         hidden_act: str = "gelu",
         max_img_size: int = 2048,
-        norm_mean: Union[tuple[float, float, float], list] = OPENAI_CLIP_MEAN,
-        norm_std: Union[tuple[float, float, float], list] = OPENAI_CLIP_STD,
-        reg_tokens: Optional[int] = None,
+        norm_mean: tuple[float, float, float] | list = OPENAI_CLIP_MEAN,
+        norm_std: tuple[float, float, float] | list = OPENAI_CLIP_STD,
+        reg_tokens: int | None = None,
         **kwargs,
     ):
         self.model_name = model_name
diff --git a/vllm/transformers_utils/configs/speculators/base.py b/vllm/transformers_utils/configs/speculators/base.py
index 1c415a43360ea..bf3a5d4131927 100644
--- a/vllm/transformers_utils/configs/speculators/base.py
+++ b/vllm/transformers_utils/configs/speculators/base.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
-from typing import Any, Union
+from typing import Any
 
 from transformers import PretrainedConfig
 
@@ -18,7 +18,7 @@ class SpeculatorsConfig(PretrainedConfig):
     @classmethod
     def from_pretrained(
         cls,
-        pretrained_model_name_or_path: Union[str, os.PathLike],
+        pretrained_model_name_or_path: str | os.PathLike,
         **kwargs,
     ) -> "SpeculatorsConfig":
         """Load speculators Eagle config and convert to vLLM format."""
diff --git a/vllm/transformers_utils/configs/step3_vl.py b/vllm/transformers_utils/configs/step3_vl.py
index 36d39e828a93b..637b82d88e265 100644
--- a/vllm/transformers_utils/configs/step3_vl.py
+++ b/vllm/transformers_utils/configs/step3_vl.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any, Optional, Union
+from typing import Any
 
 from transformers.configuration_utils import PretrainedConfig
 
@@ -53,7 +53,7 @@ class Step3TextConfig(PretrainedConfig):
         moe_num_experts: int = 48,
         moe_top_k: int = 3,
         rope_theta: float = 500000,
-        rope_scaling: Optional[dict[str, Any]] = None,
+        rope_scaling: dict[str, Any] | None = None,
         max_position_embedding: int = 65536,
         share_expert_dim: int = 5120,
         share_q_dim: int = 2048,
@@ -147,8 +147,8 @@ class Step3VLConfig(PretrainedConfig):
 
     def __init__(
         self,
-        vision_config: Optional[Union[dict, Step3VisionEncoderConfig]] = None,
-        text_config: Optional[Union[dict, Step3TextConfig]] = None,
+        vision_config: dict | Step3VisionEncoderConfig | None = None,
+        text_config: dict | Step3TextConfig | None = None,
         understand_projector_stride: int = 1,
         projector_bias: bool = True,
         image_token_id: int = 128001,
diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py
index ac22304e91250..fc0360a9ecb4e 100644
--- a/vllm/transformers_utils/configs/ultravox.py
+++ b/vllm/transformers_utils/configs/ultravox.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py
-from typing import Any, Optional
+from typing import Any
 
 import transformers
 
@@ -50,10 +50,10 @@ class UltravoxConfig(transformers.PretrainedConfig):
 
     def __init__(
         self,
-        audio_config: Optional[dict[str, Any]] = None,
-        text_config: Optional[dict[str, Any]] = None,
-        audio_model_id: Optional[str] = None,
-        text_model_id: Optional[str] = None,
+        audio_config: dict[str, Any] | None = None,
+        text_config: dict[str, Any] | None = None,
+        audio_model_id: str | None = None,
+        text_model_id: str | None = None,
         ignore_index: int = -100,
         audio_token_index: int = 32000,
         hidden_size: int = 4096,
diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py
index 60742ae97d5d1..560526bfd823e 100644
--- a/vllm/transformers_utils/detokenizer_utils.py
+++ b/vllm/transformers_utils/detokenizer_utils.py
@@ -1,12 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 from .tokenizer import AnyTokenizer
 
 
-def _replace_none_with_empty(tokens: list[Optional[str]]):
+def _replace_none_with_empty(tokens: list[str | None]):
     for i, token in enumerate(tokens):
         if token is None:
             tokens[i] = ""
@@ -111,7 +110,7 @@ def convert_ids_list_to_tokens(
 def detokenize_incrementally(
     tokenizer: AnyTokenizer,
     all_input_ids: list[int],
-    prev_tokens: Optional[list[str]],
+    prev_tokens: list[str] | None,
     prefix_offset: int,
     read_offset: int,
     skip_special_tokens: bool = False,
diff --git a/vllm/transformers_utils/dynamic_module.py b/vllm/transformers_utils/dynamic_module.py
index 3c273ad41da00..24ead83785f71 100644
--- a/vllm/transformers_utils/dynamic_module.py
+++ b/vllm/transformers_utils/dynamic_module.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
-from typing import Optional, Union
 
 from transformers.dynamic_module_utils import get_class_from_dynamic_module
 
@@ -14,18 +13,18 @@ logger = init_logger(__name__)
 def try_get_class_from_dynamic_module(
     class_reference: str,
     pretrained_model_name_or_path: str,
-    cache_dir: Optional[Union[str, os.PathLike]] = None,
+    cache_dir: str | os.PathLike | None = None,
     force_download: bool = False,
-    resume_download: Optional[bool] = None,
-    proxies: Optional[dict[str, str]] = None,
-    token: Optional[Union[bool, str]] = None,
-    revision: Optional[str] = None,
+    resume_download: bool | None = None,
+    proxies: dict[str, str] | None = None,
+    token: bool | str | None = None,
+    revision: str | None = None,
     local_files_only: bool = False,
-    repo_type: Optional[str] = None,
-    code_revision: Optional[str] = None,
+    repo_type: str | None = None,
+    code_revision: str | None = None,
     warn_on_fail: bool = True,
     **kwargs,
-) -> Optional[type]:
+) -> type | None:
     """
     As `transformers.dynamic_module_utils.get_class_from_dynamic_module`,
     but ignoring any errors.
diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py
index 81f9b76b5ef7a..0a55ac96ccf89 100644
--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from functools import lru_cache
-from typing import TYPE_CHECKING, Any, Optional, Union, cast
+from typing import TYPE_CHECKING, Any, cast
 
 from transformers import (
     AutoFeatureExtractor,
@@ -45,7 +45,7 @@ class HashableList(list):
         return hash(tuple(self))
 
 
-def _get_processor_factory_fn(processor_cls: Union[type, tuple[type, ...]]):
+def _get_processor_factory_fn(processor_cls: type | tuple[type, ...]):
     if isinstance(processor_cls, tuple) or processor_cls == ProcessorMixin:
         return AutoProcessor.from_pretrained
     if hasattr(processor_cls, "from_pretrained"):
@@ -56,7 +56,7 @@ def _get_processor_factory_fn(processor_cls: Union[type, tuple[type, ...]]):
 
 def _merge_mm_kwargs(
     model_config: "ModelConfig",
-    processor_cls: Union[type, tuple[type, ...]],
+    processor_cls: type | tuple[type, ...],
     /,
     **kwargs,
 ):
@@ -86,9 +86,9 @@ def _merge_mm_kwargs(
 def get_processor(
     processor_name: str,
     *args: Any,
-    revision: Optional[str] = None,
+    revision: str | None = None,
     trust_remote_code: bool = False,
-    processor_cls: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
+    processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
     **kwargs: Any,
 ) -> _P:
     """Load a processor for the given model name via HuggingFace."""
@@ -146,7 +146,7 @@ cached_get_processor = lru_cache(get_processor)
 
 def cached_processor_from_config(
     model_config: "ModelConfig",
-    processor_cls: Union[type[_P], tuple[type[_P], ...]] = ProcessorMixin,
+    processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
     **kwargs: Any,
 ) -> _P:
     return cached_get_processor(
@@ -161,7 +161,7 @@ def cached_processor_from_config(
 def get_feature_extractor(
     processor_name: str,
     *args: Any,
-    revision: Optional[str] = None,
+    revision: str | None = None,
     trust_remote_code: bool = False,
     **kwargs: Any,
 ):
@@ -211,7 +211,7 @@ def cached_feature_extractor_from_config(
 def get_image_processor(
     processor_name: str,
     *args: Any,
-    revision: Optional[str] = None,
+    revision: str | None = None,
     trust_remote_code: bool = False,
     **kwargs: Any,
 ):
@@ -261,9 +261,9 @@ def cached_image_processor_from_config(
 def get_video_processor(
     processor_name: str,
     *args: Any,
-    revision: Optional[str] = None,
+    revision: str | None = None,
     trust_remote_code: bool = False,
-    processor_cls_overrides: Optional[type[_V]] = None,
+    processor_cls_overrides: type[_V] | None = None,
     **kwargs: Any,
 ):
     """Load a video processor for the given model name via HuggingFace."""
@@ -300,7 +300,7 @@ cached_get_video_processor = lru_cache(get_video_processor)
 
 def cached_video_processor_from_config(
     model_config: "ModelConfig",
-    processor_cls: Optional[type[_V]] = None,
+    processor_cls: type[_V] | None = None,
     **kwargs: Any,
 ):
     return cached_get_video_processor(
diff --git a/vllm/transformers_utils/processors/ovis.py b/vllm/transformers_utils/processors/ovis.py
index 58c1b1a91658b..252f833993652 100644
--- a/vllm/transformers_utils/processors/ovis.py
+++ b/vllm/transformers_utils/processors/ovis.py
@@ -23,7 +23,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from functools import cached_property
-from typing import Union
 
 import PIL
 import torch
@@ -104,9 +103,10 @@ class OvisProcessor(ProcessorMixin):
     def __call__(
         self,
         images: ImageInput = None,
-        text: Union[
-            TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]
-        ] = None,
+        text: TextInput
+        | PreTokenizedInput
+        | list[TextInput]
+        | list[PreTokenizedInput] = None,
         **kwargs: Unpack[OvisProcessorKwargs],
     ) -> BatchFeature:
         """
diff --git a/vllm/transformers_utils/processors/ovis2_5.py b/vllm/transformers_utils/processors/ovis2_5.py
index bacc58c78b3f6..4c084fdccabc9 100644
--- a/vllm/transformers_utils/processors/ovis2_5.py
+++ b/vllm/transformers_utils/processors/ovis2_5.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 from functools import cached_property
-from typing import Optional, Union
 
 import numpy as np
 import PIL
@@ -99,10 +98,11 @@ class Ovis2_5Processor(ProcessorMixin):
     def __call__(
         self,
         images: ImageInput = None,
-        videos: Union[np.ndarray, list[ImageInput]] = None,
-        text: Union[
-            TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]
-        ] = None,
+        videos: np.ndarray | list[ImageInput] = None,
+        text: TextInput
+        | PreTokenizedInput
+        | list[TextInput]
+        | list[PreTokenizedInput] = None,
         **kwargs: Unpack[Ovis2_5ProcessorKwargs],
     ) -> BatchFeature:
         """
@@ -376,12 +376,12 @@ class Ovis2_5Processor(ProcessorMixin):
 
     def preprocess_multidata(
         self,
-        images: Optional[Union[PIL.Image.Image, list[PIL.Image.Image]]] = None,
-        video: Optional[Union[list[PIL.Image.Image], np.ndarray]] = None,
-        convert_to_rgb: Optional[bool] = True,
+        images: PIL.Image.Image | list[PIL.Image.Image] | None = None,
+        video: list[PIL.Image.Image] | np.ndarray | None = None,
+        convert_to_rgb: bool | None = True,
         min_pixels: int = MIN_PIXELS,
         max_pixels: int = MAX_PIXELS,
-        return_tensors: Optional[str] = "pt",
+        return_tensors: str | None = "pt",
     ):
         is_video = False
         if images is not None:
diff --git a/vllm/transformers_utils/runai_utils.py b/vllm/transformers_utils/runai_utils.py
index ec60d66e5cff2..3f61a22adeb9f 100644
--- a/vllm/transformers_utils/runai_utils.py
+++ b/vllm/transformers_utils/runai_utils.py
@@ -5,7 +5,6 @@ import hashlib
 import os
 import shutil
 import signal
-from typing import Optional
 
 from vllm import envs
 from vllm.assets.base import get_cache_dir
@@ -88,8 +87,8 @@ class ObjectStorageModel:
     def pull_files(
         self,
         model_path: str = "",
-        allow_pattern: Optional[list[str]] = None,
-        ignore_pattern: Optional[list[str]] = None,
+        allow_pattern: list[str] | None = None,
+        ignore_pattern: list[str] | None = None,
     ) -> None:
         """
         Pull files from object storage into the temporary directory.
diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py
index ef30efd80b1f7..c580361f92f95 100644
--- a/vllm/transformers_utils/s3_utils.py
+++ b/vllm/transformers_utils/s3_utils.py
@@ -34,7 +34,7 @@ def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]:
 def glob(
     s3: Optional["BaseClient"] = None,
     path: str = "",
-    allow_pattern: Optional[list[str]] = None,
+    allow_pattern: list[str] | None = None,
 ) -> list[str]:
     """
     List full file names from S3 path and filter by allow pattern.
@@ -58,8 +58,8 @@ def glob(
 def list_files(
     s3: "BaseClient",
     path: str,
-    allow_pattern: Optional[list[str]] = None,
-    ignore_pattern: Optional[list[str]] = None,
+    allow_pattern: list[str] | None = None,
+    ignore_pattern: list[str] | None = None,
 ) -> tuple[str, str, list[str]]:
     """
     List files from S3 path and filter by pattern.
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 9537295c6dcd2..54173c64a2075 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -7,7 +7,7 @@ import os
 import warnings
 from functools import lru_cache
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any, TypeAlias
 
 import huggingface_hub
 from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
@@ -30,14 +30,14 @@ else:
 
 logger = init_logger(__name__)
 
-AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast, TokenizerBase]
+AnyTokenizer: TypeAlias = PreTrainedTokenizer | PreTrainedTokenizerFast | TokenizerBase
 
 
 def decode_tokens(
     tokenizer: AnyTokenizer,
     token_ids: list[int],
     *,
-    skip_special_tokens: Optional[bool] = None,
+    skip_special_tokens: bool | None = None,
 ) -> str:
     """
     Backend-agnostic equivalent of HF's
@@ -56,9 +56,9 @@ def encode_tokens(
     tokenizer: AnyTokenizer,
     text: str,
     *,
-    truncation: Optional[bool] = None,
-    max_length: Optional[int] = None,
-    add_special_tokens: Optional[bool] = None,
+    truncation: bool | None = None,
+    max_length: int | None = None,
+    add_special_tokens: bool | None = None,
 ) -> list[int]:
     """
     Backend-agnostic equivalent of HF's
@@ -137,12 +137,12 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
 
 
 def get_tokenizer(
-    tokenizer_name: Union[str, Path],
+    tokenizer_name: str | Path,
     *args,
     tokenizer_mode: str = "auto",
     trust_remote_code: bool = False,
-    revision: Optional[str] = None,
-    download_dir: Optional[str] = None,
+    revision: str | None = None,
+    download_dir: str | None = None,
     **kwargs,
 ) -> AnyTokenizer:
     """Gets a tokenizer for the given model name via HuggingFace or ModelScope."""
diff --git a/vllm/transformers_utils/tokenizer_base.py b/vllm/transformers_utils/tokenizer_base.py
index 2d64265abbf21..7421eb5348082 100644
--- a/vllm/transformers_utils/tokenizer_base.py
+++ b/vllm/transformers_utils/tokenizer_base.py
@@ -3,7 +3,7 @@
 
 import importlib
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any
 
 if TYPE_CHECKING:
     from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
@@ -71,11 +71,11 @@ class TokenizerBase(ABC):
     @abstractmethod
     def __call__(
         self,
-        text: Union[str, list[str], list[int]],
-        text_pair: Optional[str] = None,
+        text: str | list[str] | list[int],
+        text_pair: str | None = None,
         add_special_tokens: bool = False,
         truncation: bool = False,
-        max_length: Optional[int] = None,
+        max_length: int | None = None,
     ):
         raise NotImplementedError()
 
@@ -92,7 +92,7 @@ class TokenizerBase(ABC):
         self,
         text: str,
         truncation: bool = False,
-        max_length: Optional[int] = None,
+        max_length: int | None = None,
     ) -> list[int]:
         raise NotImplementedError()
 
@@ -100,9 +100,9 @@ class TokenizerBase(ABC):
     def encode(
         self,
         text: str,
-        truncation: Optional[bool] = None,
-        max_length: Optional[int] = None,
-        add_special_tokens: Optional[bool] = None,
+        truncation: bool | None = None,
+        max_length: int | None = None,
+        add_special_tokens: bool | None = None,
     ) -> list[int]:
         raise NotImplementedError()
 
@@ -110,7 +110,7 @@ class TokenizerBase(ABC):
     def apply_chat_template(
         self,
         messages: list["ChatCompletionMessageParam"],
-        tools: Optional[list[dict[str, Any]]] = None,
+        tools: list[dict[str, Any]] | None = None,
         **kwargs,
     ) -> list[int]:
         raise NotImplementedError()
@@ -120,9 +120,7 @@ class TokenizerBase(ABC):
         raise NotImplementedError()
 
     @abstractmethod
-    def decode(
-        self, ids: Union[list[int], int], skip_special_tokens: bool = True
-    ) -> str:
+    def decode(self, ids: list[int] | int, skip_special_tokens: bool = True) -> str:
         raise NotImplementedError()
 
     @abstractmethod
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index eae067fcfa344..6f710bf23360f 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import TYPE_CHECKING, Any, Optional, Union, cast
+from typing import TYPE_CHECKING, Any, cast
 
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer_base import TokenizerBase
@@ -90,10 +90,10 @@ def truncate_tool_call_ids(request: "MistralChatCompletionRequest"):
 
 def _prepare_apply_chat_template_tools_and_messages(
     messages: list["ChatCompletionMessageParam"],
-    tools: Optional[list[dict[str, Any]]] = None,
+    tools: list[dict[str, Any]] | None = None,
     continue_final_message: bool = False,
     add_generation_prompt: bool = False,
-) -> tuple[list["ChatCompletionMessageParam"], Optional[list[dict[str, Any]]]]:
+) -> tuple[list["ChatCompletionMessageParam"], list[dict[str, Any]] | None]:
     if add_generation_prompt and continue_final_message:
         raise ValueError(
             "Cannot set both `add_generation_prompt` and "
@@ -144,7 +144,7 @@ def validate_request_params(request: "ChatCompletionRequest"):
         raise ValueError("chat_template is not supported for Mistral tokenizers.")
 
 
-def _tekken_token_to_id(tokenizer: "Tekkenizer", t: Union[str, bytes]) -> int:
+def _tekken_token_to_id(tokenizer: "Tekkenizer", t: str | bytes) -> int:
     from mistral_common.tokens.tokenizers.tekken import Tekkenizer
 
     assert isinstance(tokenizer, Tekkenizer), type(tokenizer)
@@ -197,7 +197,7 @@ class MistralTokenizer(TokenizerBase):
 
     @classmethod
     def from_pretrained(
-        cls, path_or_repo_id: str, *, revision: Optional[str] = None
+        cls, path_or_repo_id: str, *, revision: str | None = None
     ) -> "MistralTokenizer":
         from transformers.tokenization_mistral_common import (
             MistralCommonTokenizer as TransformersMistralTokenizer,
@@ -298,11 +298,11 @@ class MistralTokenizer(TokenizerBase):
 
     def __call__(
         self,
-        text: Union[str, list[str], list[int]],
-        text_pair: Optional[str] = None,
+        text: str | list[str] | list[int],
+        text_pair: str | None = None,
         add_special_tokens: bool = False,
         truncation: bool = False,
-        max_length: Optional[int] = None,
+        max_length: int | None = None,
     ):
         return self.transformers_tokenizer(
             text=text,
@@ -327,7 +327,7 @@ class MistralTokenizer(TokenizerBase):
         self,
         text: str,
         truncation: bool = False,
-        max_length: Optional[int] = None,
+        max_length: int | None = None,
     ) -> list[int]:
         # Mistral Tokenizers should not add special tokens
         return self.transformers_tokenizer.encode(
@@ -337,9 +337,9 @@ class MistralTokenizer(TokenizerBase):
     def encode(
         self,
         text: str,
-        truncation: Optional[bool] = None,
-        max_length: Optional[int] = None,
-        add_special_tokens: Optional[bool] = None,
+        truncation: bool | None = None,
+        max_length: int | None = None,
+        add_special_tokens: bool | None = None,
     ) -> list[int]:
         if add_special_tokens is not None:
             return self.transformers_tokenizer.encode(
@@ -359,7 +359,7 @@ class MistralTokenizer(TokenizerBase):
     def apply_chat_template(
         self,
         messages: list["ChatCompletionMessageParam"],
-        tools: Optional[list[dict[str, Any]]] = None,
+        tools: list[dict[str, Any]] | None = None,
         **kwargs,
     ) -> list[int]:
         add_generation_prompt = kwargs.pop("add_generation_prompt", False)
@@ -384,9 +384,7 @@ class MistralTokenizer(TokenizerBase):
             return_dict=False,
         )
 
-    def decode(
-        self, ids: Union[list[int], int], skip_special_tokens: bool = True
-    ) -> str:
+    def decode(self, ids: list[int] | int, skip_special_tokens: bool = True) -> str:
         return self.transformers_tokenizer.decode(
             ids, skip_special_tokens=skip_special_tokens
         )
diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py
index 8952a0b197d69..b87414d79df0f 100644
--- a/vllm/transformers_utils/utils.py
+++ b/vllm/transformers_utils/utils.py
@@ -6,7 +6,7 @@ import struct
 from functools import cache
 from os import PathLike
 from pathlib import Path
-from typing import Any, Optional, Union
+from typing import Any
 
 from vllm.envs import VLLM_MODEL_REDIRECT_PATH
 from vllm.logger import init_logger
@@ -18,7 +18,7 @@ def is_s3(model_or_path: str) -> bool:
     return model_or_path.lower().startswith("s3://")
 
 
-def check_gguf_file(model: Union[str, PathLike]) -> bool:
+def check_gguf_file(model: str | PathLike) -> bool:
     """Check if the file is a GGUF model."""
     model = Path(model)
     if not model.is_file():
@@ -38,8 +38,8 @@ def check_gguf_file(model: Union[str, PathLike]) -> bool:
 
 def modelscope_list_repo_files(
     repo_id: str,
-    revision: Optional[str] = None,
-    token: Union[str, bool, None] = None,
+    revision: str | None = None,
+    token: str | bool | None = None,
 ) -> list[str]:
     """List files in a modelscope repo."""
     from modelscope.hub.api import HubApi
@@ -57,7 +57,7 @@ def modelscope_list_repo_files(
     return files
 
 
-def _maybe_json_dict(path: Union[str, PathLike]) -> dict[str, str]:
+def _maybe_json_dict(path: str | PathLike) -> dict[str, str]:
     with open(path) as f:
         try:
             return json.loads(f.read())
@@ -65,7 +65,7 @@ def _maybe_json_dict(path: Union[str, PathLike]) -> dict[str, str]:
             return dict[str, str]()
 
 
-def _maybe_space_split_dict(path: Union[str, PathLike]) -> dict[str, str]:
+def _maybe_space_split_dict(path: str | PathLike) -> dict[str, str]:
     parsed_dict = dict[str, str]()
     with open(path) as f:
         for line in f.readlines():
@@ -104,7 +104,7 @@ def maybe_model_redirect(model: str) -> str:
     return model
 
 
-def parse_safetensors_file_metadata(path: Union[str, PathLike]) -> dict[str, Any]:
+def parse_safetensors_file_metadata(path: str | PathLike) -> dict[str, Any]:
     with open(path, "rb") as f:
         length_of_metadata = struct.unpack("<Q", f.read(8))[0]
         metadata = json.loads(f.read(length_of_metadata).decode("utf-8"))
diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py
index ed470ebe88929..27a4f89e00456 100644
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -10,7 +10,7 @@ import time
 from enum import Enum
 from pathlib import Path
 from threading import Thread
-from typing import Any, Optional, Union
+from typing import Any
 from uuid import uuid4
 
 import cpuinfo
@@ -32,7 +32,7 @@ _USAGE_STATS_DO_NOT_TRACK_PATH = os.path.join(_config_home, "do_not_track")
 _USAGE_STATS_ENABLED = None
 _USAGE_STATS_SERVER = envs.VLLM_USAGE_STATS_SERVER
 
-_GLOBAL_RUNTIME_DATA = dict[str, Union[str, int, bool]]()
+_GLOBAL_RUNTIME_DATA = dict[str, str | int | bool]()
 
 _USAGE_ENV_VARS_TO_COLLECT = [
     "VLLM_USE_MODELSCOPE",
@@ -46,7 +46,7 @@ _USAGE_ENV_VARS_TO_COLLECT = [
 ]
 
 
-def set_runtime_usage_data(key: str, value: Union[str, int, bool]) -> None:
+def set_runtime_usage_data(key: str, value: str | int | bool) -> None:
     """Set global usage data that will be sent with every usage heartbeat."""
     _GLOBAL_RUNTIME_DATA[key] = value
 
@@ -131,33 +131,33 @@ class UsageMessage:
         self.uuid = str(uuid4())
 
         # Environment Information
-        self.provider: Optional[str] = None
-        self.num_cpu: Optional[int] = None
-        self.cpu_type: Optional[str] = None
-        self.cpu_family_model_stepping: Optional[str] = None
-        self.total_memory: Optional[int] = None
-        self.architecture: Optional[str] = None
-        self.platform: Optional[str] = None
-        self.cuda_runtime: Optional[str] = None
-        self.gpu_count: Optional[int] = None
-        self.gpu_type: Optional[str] = None
-        self.gpu_memory_per_device: Optional[int] = None
-        self.env_var_json: Optional[str] = None
+        self.provider: str | None = None
+        self.num_cpu: int | None = None
+        self.cpu_type: str | None = None
+        self.cpu_family_model_stepping: str | None = None
+        self.total_memory: int | None = None
+        self.architecture: str | None = None
+        self.platform: str | None = None
+        self.cuda_runtime: str | None = None
+        self.gpu_count: int | None = None
+        self.gpu_type: str | None = None
+        self.gpu_memory_per_device: int | None = None
+        self.env_var_json: str | None = None
 
         # vLLM Information
-        self.model_architecture: Optional[str] = None
-        self.vllm_version: Optional[str] = None
-        self.context: Optional[str] = None
+        self.model_architecture: str | None = None
+        self.vllm_version: str | None = None
+        self.context: str | None = None
 
         # Metadata
-        self.log_time: Optional[int] = None
-        self.source: Optional[str] = None
+        self.log_time: int | None = None
+        self.source: str | None = None
 
     def report_usage(
         self,
         model_architecture: str,
         usage_context: UsageContext,
-        extra_kvs: Optional[dict[str, Any]] = None,
+        extra_kvs: dict[str, Any] | None = None,
     ) -> None:
         t = Thread(
             target=self._report_usage_worker,
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index 22c2a4b5362c2..c31c1ab0309c2 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import asyncio
 import concurrent
 import contextlib
@@ -46,6 +44,7 @@ from collections import UserDict, defaultdict
 from collections.abc import (
     AsyncGenerator,
     Awaitable,
+    Callable,
     Collection,
     Generator,
     Hashable,
@@ -62,12 +61,10 @@ from pathlib import Path
 from typing import (
     TYPE_CHECKING,
     Any,
-    Callable,
     Generic,
     Literal,
     TextIO,
     TypeVar,
-    Union,
 )
 from urllib.parse import urlparse
 from uuid import uuid4
@@ -99,6 +96,12 @@ if TYPE_CHECKING:
 
     from vllm.config import ModelConfig, VllmConfig
     from vllm.sequence import IntermediateTensors
+else:
+    Namespace = object
+
+    ModelConfig = object
+    VllmConfig = object
+    IntermediateTensors = object
 
 logger = init_logger(__name__)
 
@@ -245,9 +248,7 @@ class AsyncMicrobatchTokenizer:
         self._queues: dict[
             tuple,
             asyncio.Queue[
-                Union[
-                    tuple[str, dict, asyncio.Future], tuple[list[int], asyncio.Future]
-                ]
+                tuple[str, dict, asyncio.Future] | tuple[list[int], asyncio.Future]
             ],
         ] = {}
         self._batcher_tasks: list[asyncio.Task] = []
@@ -274,7 +275,7 @@ class AsyncMicrobatchTokenizer:
     def _get_queue(
         self, loop: asyncio.AbstractEventLoop, key: tuple
     ) -> asyncio.Queue[
-        Union[tuple[str, dict, asyncio.Future], tuple[list[int], asyncio.Future]]
+        tuple[str, dict, asyncio.Future] | tuple[list[int], asyncio.Future]
     ]:
         """Get the request queue for the given operation key, creating a new
         queue and batcher task if needed."""
@@ -431,7 +432,7 @@ def cancel_task_threadsafe(task: Task):
         run_in_loop(task.get_loop(), task.cancel)
 
 
-def close_sockets(sockets: Sequence[Union[zmq.Socket, zmq.asyncio.Socket]]):
+def close_sockets(sockets: Sequence[zmq.Socket | zmq.asyncio.Socket]):
     for sock in sockets:
         if sock is not None:
             sock.close(linger=0)
@@ -767,8 +768,8 @@ def _generate_random_fp8(
 
 
 def get_kv_cache_torch_dtype(
-    cache_dtype: Union[str, torch.dtype] | None,
-    model_dtype: Union[str, torch.dtype] | None = None,
+    cache_dtype: str | torch.dtype | None,
+    model_dtype: str | torch.dtype | None = None,
 ) -> torch.dtype:
     if isinstance(cache_dtype, str):
         if cache_dtype == "auto":
@@ -795,8 +796,8 @@ def create_kv_caches_with_random_flash(
     num_layers: int,
     num_heads: int,
     head_size: int,
-    cache_dtype: Union[str, torch.dtype] | None,
-    model_dtype: Union[str, torch.dtype] | None = None,
+    cache_dtype: str | torch.dtype | None,
+    model_dtype: str | torch.dtype | None = None,
     seed: int | None = None,
     device: str | None = "cuda",
     cache_layout: str | None = "NHD",
@@ -837,8 +838,8 @@ def create_kv_caches_with_random(
     num_layers: int,
     num_heads: int,
     head_size: int,
-    cache_dtype: Union[str, torch.dtype] | None,
-    model_dtype: Union[str, torch.dtype] | None = None,
+    cache_dtype: str | torch.dtype | None,
+    model_dtype: str | torch.dtype | None = None,
     seed: int | None = None,
     device: str | None = "cuda",
 ) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
@@ -952,7 +953,7 @@ def make_tensor_with_pad(
     dtype: torch.dtype,
     *,
     max_len: int | None = None,
-    device: Union[str, torch.device] | None = None,
+    device: str | torch.device | None = None,
     pin_memory: bool = False,
 ) -> torch.Tensor:
     """
@@ -974,7 +975,7 @@ def make_tensor_with_pad(
 def async_tensor_h2d(
     data: list,
     dtype: torch.dtype,
-    target_device: Union[str, torch.device],
+    target_device: str | torch.device,
     pin_memory: bool,
 ) -> torch.Tensor:
     """Asynchronously create a tensor and copy it from host to device."""
@@ -1041,7 +1042,7 @@ def as_list(maybe_list: Iterable[T]) -> list[T]:
     return maybe_list if isinstance(maybe_list, list) else list(maybe_list)
 
 
-def as_iter(obj: Union[T, Iterable[T]]) -> Iterable[T]:
+def as_iter(obj: T | Iterable[T]) -> Iterable[T]:
     if isinstance(obj, str) or not isinstance(obj, Iterable):
         return [obj]  # type: ignore[list-item]
     return obj
@@ -1050,7 +1051,7 @@ def as_iter(obj: Union[T, Iterable[T]]) -> Iterable[T]:
 # `collections` helpers
 def is_list_of(
     value: object,
-    typ: Union[type[T], tuple[type[T], ...]],
+    typ: type[T] | tuple[type[T], ...],
     *,
     check: Literal["first", "all"] = "first",
 ) -> TypeIs[list[T]]:
@@ -1266,7 +1267,7 @@ F = TypeVar("F", bound=Callable[..., Any])
 
 def deprecate_args(
     start_index: int,
-    is_deprecated: Union[bool, Callable[[], bool]] = True,
+    is_deprecated: bool | Callable[[], bool] = True,
     additional_message: str | None = None,
 ) -> Callable[[F], F]:
     if not callable(is_deprecated):
@@ -1306,7 +1307,7 @@ def deprecate_args(
 
 def deprecate_kwargs(
     *kws: str,
-    is_deprecated: Union[bool, Callable[[], bool]] = True,
+    is_deprecated: bool | Callable[[], bool] = True,
     additional_message: str | None = None,
 ) -> Callable[[F], F]:
     deprecated_kws = set(kws)
@@ -1892,7 +1893,7 @@ class FlexibleArgumentParser(ArgumentParser):
         # only expecting a flat dictionary of atomic types
         processed_args: list[str] = []
 
-        config: dict[str, Union[int, str]] = {}
+        config: dict[str, int | str] = {}
         try:
             with open(file_path) as config_file:
                 config = yaml.safe_load(config_file)
@@ -2149,10 +2150,11 @@ def weak_ref_tensor(tensor: Any) -> Any:
 
 
 def weak_ref_tensors(
-    tensors: Union[
-        torch.Tensor, list[torch.Tensor], tuple[torch.Tensor], IntermediateTensors
-    ],
-) -> Union[torch.Tensor, list[Any], tuple[Any], Any]:
+    tensors: torch.Tensor
+    | list[torch.Tensor]
+    | tuple[torch.Tensor]
+    | IntermediateTensors,
+) -> torch.Tensor | list[Any] | tuple[Any] | Any:
     """
     Convenience function to create weak references to tensors,
     for single tensor, list of tensors or tuple of tensors.
@@ -2183,7 +2185,7 @@ def get_cuda_view_from_cpu_tensor(cpu_tensor: torch.Tensor) -> torch.Tensor:
     return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor)
 
 
-def import_from_path(module_name: str, file_path: Union[str, os.PathLike]):
+def import_from_path(module_name: str, file_path: str | os.PathLike):
     """
     Import a Python file according to its file path.
 
@@ -2584,7 +2586,7 @@ class MemorySnapshot:
         self.non_torch_memory = self.cuda_memory - self.torch_memory
         self.timestamp = time.time()
 
-    def __sub__(self, other: MemorySnapshot) -> MemorySnapshot:
+    def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot":
         return MemorySnapshot(
             torch_peak=self.torch_peak - other.torch_peak,
             free_memory=self.free_memory - other.free_memory,
@@ -2778,13 +2780,13 @@ def make_zmq_path(scheme: str, host: str, port: int | None = None) -> str:
 
 # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L783 # noqa: E501
 def make_zmq_socket(
-    ctx: Union[zmq.asyncio.Context, zmq.Context],  # type: ignore[name-defined]
+    ctx: zmq.asyncio.Context | zmq.Context,  # type: ignore[name-defined]
     path: str,
     socket_type: Any,
     bind: bool | None = None,
     identity: bytes | None = None,
     linger: int | None = None,
-) -> Union[zmq.Socket, zmq.asyncio.Socket]:  # type: ignore[name-defined]
+) -> zmq.Socket | zmq.asyncio.Socket:  # type: ignore[name-defined]
     """Make a ZMQ socket with the proper bind/connect semantics."""
 
     mem = psutil.virtual_memory()
@@ -2950,7 +2952,7 @@ def bind_kv_cache(
 
 def run_method(
     obj: Any,
-    method: Union[str, bytes, Callable],
+    method: str | bytes | Callable,
     args: tuple[Any],
     kwargs: dict[str, Any],
 ) -> Any:
diff --git a/vllm/utils/cache.py b/vllm/utils/cache.py
index a57ef9b70ccc8..d5e08caa8a1ed 100644
--- a/vllm/utils/cache.py
+++ b/vllm/utils/cache.py
@@ -1,11 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 from collections import UserDict
-from collections.abc import Hashable, Iterator, KeysView, Mapping
+from collections.abc import Callable, Hashable, Iterator, KeysView, Mapping
 from types import MappingProxyType
-from typing import Callable, Generic, NamedTuple, TypeVar, Union, cast, overload
+from typing import Generic, NamedTuple, TypeVar, cast, overload
 
 import cachetools
 
@@ -43,7 +41,7 @@ class CacheInfo(NamedTuple):
 
         return self.hits / self.total
 
-    def __sub__(self, other: CacheInfo):
+    def __sub__(self, other: "CacheInfo"):
         return CacheInfo(
             hits=self.hits - other.hits,
             total=self.total - other.total,
@@ -129,12 +127,10 @@ class LRUCache(cachetools.LRUCache[_K, _V], Generic[_K, _V]):
     def get(self, key: _K, /) -> _V | None: ...
 
     @overload
-    def get(self, key: _K, /, default: Union[_V, _T]) -> Union[_V, _T]: ...
+    def get(self, key: _K, /, default: _V | _T) -> _V | _T: ...
 
-    def get(
-        self, key: _K, /, default: Union[_V, _T] | None = None
-    ) -> Union[_V, _T] | None:
-        value: Union[_V, _T] | None
+    def get(self, key: _K, /, default: _V | _T | None = None) -> _V | _T | None:
+        value: _V | _T | None
         if key in self:
             value = self.__getitem__(key, update_info=False)  # type: ignore[call-arg]
 
@@ -149,12 +145,10 @@ class LRUCache(cachetools.LRUCache[_K, _V], Generic[_K, _V]):
     def pop(self, key: _K) -> _V: ...
 
     @overload
-    def pop(self, key: _K, default: Union[_V, _T]) -> Union[_V, _T]: ...
+    def pop(self, key: _K, default: _V | _T) -> _V | _T: ...
 
-    def pop(
-        self, key: _K, default: Union[_V, _T] | None = None
-    ) -> Union[_V, _T] | None:
-        value: Union[_V, _T] | None
+    def pop(self, key: _K, default: _V | _T | None = None) -> _V | _T | None:
+        value: _V | _T | None
         if key not in self:
             return default
 
diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py
index 8f8f25f1302d6..39ffba3137df8 100644
--- a/vllm/utils/deep_gemm.py
+++ b/vllm/utils/deep_gemm.py
@@ -5,12 +5,11 @@
 Users of vLLM should always import **only** these wrappers.
 """
 
-from __future__ import annotations
-
 import functools
 import importlib
 import os
-from typing import Any, Callable, NoReturn
+from collections.abc import Callable
+from typing import Any, NoReturn
 
 import torch
 
diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index ad8295f8f6893..24b80e389e838 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -5,15 +5,14 @@
 Users of vLLM should always import **only** these wrappers.
 """
 
-from __future__ import annotations
-
 import contextlib
 import functools
 import importlib
 import importlib.util
 import os
 import shutil
-from typing import Any, Callable, NoReturn
+from collections.abc import Callable
+from typing import Any, NoReturn
 
 import requests
 import torch
diff --git a/vllm/utils/gc_utils.py b/vllm/utils/gc_utils.py
index e3b5b61dd3643..99c19c9db28e9 100644
--- a/vllm/utils/gc_utils.py
+++ b/vllm/utils/gc_utils.py
@@ -5,7 +5,7 @@ import json
 import time
 from collections import Counter
 from contextlib import suppress
-from typing import Any, Optional
+from typing import Any
 
 from vllm.envs import VLLM_GC_DEBUG
 from vllm.logger import init_logger
@@ -21,7 +21,7 @@ class GCDebugConfig:
     - '{"top_objects":5}': enable GC debugger with top 5 collected objects
     """
 
-    def __init__(self, gc_debug_conf: Optional[str] = None) -> None:
+    def __init__(self, gc_debug_conf: str | None = None) -> None:
         self.enabled: bool = False
         self.top_objects: int = -1
 
diff --git a/vllm/utils/jsontree.py b/vllm/utils/jsontree.py
index dcdc6ccb4c638..cde9aa6ff901c 100644
--- a/vllm/utils/jsontree.py
+++ b/vllm/utils/jsontree.py
@@ -2,9 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Helper functions to work with nested JSON structures."""
 
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from functools import reduce
-from typing import TYPE_CHECKING, Callable, TypeVar, Union, cast, overload
+from typing import TYPE_CHECKING, TypeAlias, TypeVar, cast, overload
 
 if TYPE_CHECKING:
     import torch
@@ -14,23 +14,20 @@ if TYPE_CHECKING:
 _T = TypeVar("_T")
 _U = TypeVar("_U")
 
-JSONTree = Union[
-    dict[str, "JSONTree[_T]"],
-    list["JSONTree[_T]"],
-    tuple["JSONTree[_T]", ...],
-    _T,
-]
+JSONTree: TypeAlias = (
+    dict[str, "JSONTree[_T]"] | list["JSONTree[_T]"] | tuple["JSONTree[_T]", ...] | _T
+)
 """A nested JSON structure where the leaves need not be JSON-serializable."""
 
-_JSONTree = Union[
-    dict[str, "JSONTree[_T]"],
-    list["JSONTree[_T]"],
-    tuple["JSONTree[_T]", ...],
-    dict[str, _T],
-    list[_T],
-    tuple[_T, ...],
-    _T,
-]
+_JSONTree: TypeAlias = (
+    dict[str, "JSONTree[_T]"]
+    | list["JSONTree[_T]"]
+    | tuple["JSONTree[_T]", ...]
+    | dict[str, _T]
+    | list[_T]
+    | tuple[_T, ...]
+    | _T
+)
 """
 Same as `JSONTree` but with additional `Union` members to satisfy overloads.
 """
@@ -58,22 +55,22 @@ def json_map_leaves(
 @overload
 def json_map_leaves(
     func: Callable[[_T], _U],
-    value: Union[_T, dict[str, _T]],
-) -> Union[_U, dict[str, _U]]: ...
+    value: _T | dict[str, _T],
+) -> _U | dict[str, _U]: ...
 
 
 @overload
 def json_map_leaves(
     func: Callable[[_T], _U],
-    value: Union[_T, list[_T]],
-) -> Union[_U, list[_U]]: ...
+    value: _T | list[_T],
+) -> _U | list[_U]: ...
 
 
 @overload
 def json_map_leaves(
     func: Callable[[_T], _U],
-    value: Union[_T, tuple[_T, ...]],
-) -> Union[_U, tuple[_U, ...]]: ...
+    value: _T | tuple[_T, ...],
+) -> _U | tuple[_U, ...]: ...
 
 
 @overload
@@ -85,8 +82,8 @@ def json_map_leaves(
 
 def json_map_leaves(
     func: Callable[[_T], _U],
-    value: Union["BatchedTensorInputs", _JSONTree[_T]],
-) -> Union["BatchedTensorInputs", _JSONTree[_U]]:
+    value: "BatchedTensorInputs" | _JSONTree[_T],
+) -> "BatchedTensorInputs" | _JSONTree[_U]:
     """Apply a function to each leaf in a nested JSON structure."""
     if isinstance(value, dict):
         return {
@@ -104,7 +101,7 @@ def json_map_leaves(
 @overload
 def json_reduce_leaves(
     func: Callable[[_T, _T], _T],
-    value: Union[_T, dict[str, _T]],
+    value: _T | dict[str, _T],
     /,
 ) -> _T: ...
 
@@ -112,7 +109,7 @@ def json_reduce_leaves(
 @overload
 def json_reduce_leaves(
     func: Callable[[_T, _T], _T],
-    value: Union[_T, list[_T]],
+    value: _T | list[_T],
     /,
 ) -> _T: ...
 
@@ -120,7 +117,7 @@ def json_reduce_leaves(
 @overload
 def json_reduce_leaves(
     func: Callable[[_T, _T], _T],
-    value: Union[_T, tuple[_T, ...]],
+    value: _T | tuple[_T, ...],
     /,
 ) -> _T: ...
 
@@ -143,11 +140,11 @@ def json_reduce_leaves(
 
 
 def json_reduce_leaves(
-    func: Callable[..., Union[_T, _U]],
+    func: Callable[..., _T | _U],
     value: _JSONTree[_T],
     initial: _U = cast(_U, ...),  # noqa: B008
     /,
-) -> Union[_T, _U]:
+) -> _T | _U:
     """
     Apply a function of two arguments cumulatively to each leaf in a
     nested JSON structure, from left to right, so as to reduce the
diff --git a/vllm/utils/tensor_schema.py b/vllm/utils/tensor_schema.py
index e17676ccf7ef2..526dfd38bac46 100644
--- a/vllm/utils/tensor_schema.py
+++ b/vllm/utils/tensor_schema.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Annotated, Any, Optional, Union, get_args, get_origin, get_type_hints
+from types import UnionType
+from typing import Annotated, Any, Union, get_args, get_origin, get_type_hints
 
 import torch
 
@@ -12,16 +13,16 @@ logger = init_logger(__name__)
 class TensorShape:
     def __init__(
         self,
-        *dims: Union[int, str],
-        dynamic_dims: Optional[set[str]] = None,
+        *dims: int | str,
+        dynamic_dims: set[str] | None = None,
     ) -> None:
         super().__init__()
 
         self.dims = dims
         self.dynamic_dims = dynamic_dims if dynamic_dims else set()
 
-    def resolve(self, **bindings: int) -> tuple[Union[int, str], ...]:
-        resolved = list[Union[int, str]]()
+    def resolve(self, **bindings: int) -> tuple[int | str, ...]:
+        resolved = list[int | str]()
         for dim in self.dims:
             if isinstance(dim, str) and dim in bindings:
                 resolved.append(bindings[dim])
@@ -48,7 +49,7 @@ class TensorSchema:
         self,
         *,
         validate: bool = True,
-        resolve_bindings: Optional[dict[str, int]] = None,
+        resolve_bindings: dict[str, int] | None = None,
         **kwargs: Any,
     ) -> None:
         super().__init__()
@@ -71,7 +72,7 @@ class TensorSchema:
         self,
         actual: tuple[int, ...],
         reference: tuple[int, ...],
-        expected_shape: tuple[Union[int, str], ...],
+        expected_shape: tuple[int | str, ...],
         dynamic_dims: set[str],
     ) -> bool:
         if len(actual) != len(reference) or len(actual) > len(expected_shape):
@@ -100,7 +101,7 @@ class TensorSchema:
         self,
         value: object,
         field_name: str,
-        expected_shape: tuple[Union[int, str], ...],
+        expected_shape: tuple[int | str, ...],
         dynamic_dims: set[str],
         leading_idxs: tuple[int, ...] = (),
     ) -> tuple[int, ...]:
@@ -154,7 +155,7 @@ class TensorSchema:
     def _validate_tensor_shape_expected(
         self,
         actual_shape: tuple[int, ...],
-        expected_shape: tuple[Union[int, str], ...],
+        expected_shape: tuple[int | str, ...],
         field_name: str,
         shape_env: dict[str, int],
         dynamic_dims: set[str],
@@ -209,7 +210,8 @@ class TensorSchema:
                     actual_type = args[0]
 
                 # Check arg was provided as Union
-                if get_origin(actual_type) is Union:
+                if get_origin(actual_type) in {Union, UnionType}:
+                    # Union for Union[X, Y] and UnionType for X | Y
                     args = get_args(actual_type)
                     # Skip validation when Union contains None
                     if type(None) in args:
diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py
index 6e27e93c91153..211eefdb6c110 100644
--- a/vllm/v1/attention/backends/cpu_attn.py
+++ b/vllm/v1/attention/backends/cpu_attn.py
@@ -110,7 +110,7 @@ class TorchSDPAMetadata(AttentionMetadata):
     """Metadata for PagedAttention."""
     # (batch_size,). The length of sequences (entire tokens seen so far) per
     # sequence.
-    decode_seq_lens_tensor: Optional[torch.Tensor]
+    decode_seq_lens_tensor: torch.Tensor | None
     # Maximum sequence length in the batch. 0 if it is prefill-only batch.
     decode_max_seq_len: int
     # (batch_size, max_blocks_per_seq).
@@ -119,39 +119,39 @@ class TorchSDPAMetadata(AttentionMetadata):
     # in the kv cache. Each block can contain up to block_size tokens.
     # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph
     # captured.
-    decode_block_tables: Optional[torch.Tensor]
+    decode_block_tables: torch.Tensor | None
     """Metadata for TorchSDPABackend.
     """
     # Currently, input sequences can only contain all prompts
     # or all decoding. True if all sequences are prompts.
     chunked_prefill: bool
-    seq_lens: Optional[list[int]] = None  # For non-chunked prefill
+    seq_lens: list[int] | None = None  # For non-chunked prefill
 
     # For chunked prefill only
-    max_query_len: Optional[int] = None
-    prefill_max_seq_len: Optional[int] = None
-    prefill_query_start_loc: Optional[torch.Tensor] = None
-    prefill_seq_start_loc: Optional[torch.Tensor] = None
-    prefill_block_tables: Optional[torch.Tensor] = None
+    max_query_len: int | None = None
+    prefill_max_seq_len: int | None = None
+    prefill_query_start_loc: torch.Tensor | None = None
+    prefill_seq_start_loc: torch.Tensor | None = None
+    prefill_block_tables: torch.Tensor | None = None
 
     # For V1 logits index only
-    query_start_loc: Optional[torch.Tensor] = None
+    query_start_loc: torch.Tensor | None = None
 
     # Begin encoder attn & enc/dec cross-attn fields...
     # Encoder sequence lengths representation
-    encoder_seq_lens: Optional[list[int]] = None
-    encoder_seq_lens_tensor: Optional[torch.Tensor] = None
+    encoder_seq_lens: list[int] | None = None
+    encoder_seq_lens_tensor: torch.Tensor | None = None
 
     # Maximum sequence length among encoder sequences
-    max_encoder_seq_len: Optional[int] = None
+    max_encoder_seq_len: int | None = None
 
     # Number of tokens input to encoder
-    num_encoder_tokens: Optional[int] = None
+    num_encoder_tokens: int | None = None
 
     # Cross-attention memory-mapping data structures: slot mapping
     # and block tables
-    cross_slot_mapping: Optional[torch.Tensor] = None
-    cross_block_tables: Optional[torch.Tensor] = None
+    cross_slot_mapping: torch.Tensor | None = None
+    cross_block_tables: torch.Tensor | None = None
 
     def __post_init__(self):
         # Set during the execution of the first attention op.
@@ -159,9 +159,9 @@ class TorchSDPAMetadata(AttentionMetadata):
         # when alibi slopes is used. It is because of the limitation
         # from xformer API.
         # will not appear in the __repr__ and __init__
-        self.attn_bias: Optional[list[torch.Tensor]] = None
-        self.encoder_attn_bias: Optional[list[torch.Tensor]] = None
-        self.cross_attn_bias: Optional[list[torch.Tensor]] = None
+        self.attn_bias: list[torch.Tensor] | None = None
+        self.encoder_attn_bias: list[torch.Tensor] | None = None
+        self.cross_attn_bias: list[torch.Tensor] | None = None
 
     @property
     def is_all_encoder_attn_metadata_set(self):
@@ -237,7 +237,7 @@ class TorchSDPAMetadata(AttentionMetadata):
     def get_attn_bias(
         self,
         attn_type: str,
-    ) -> Optional[list[torch.Tensor]]:
+    ) -> list[torch.Tensor] | None:
         """
         Extract appropriate attention bias from attention metadata
         according to attention type.
@@ -439,12 +439,12 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
+        logits_soft_cap: float | None = None,
         attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
+        kv_sharing_target_layer_name: str | None = None,
     ) -> None:
         if kv_sharing_target_layer_name is not None:
             raise NotImplementedError("KV sharing is not supported in V0.")
@@ -484,9 +484,9 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]):
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: TorchSDPAMetadata,  # type: ignore
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Forward pass with torch SDPA and PagedAttention.
 
@@ -737,7 +737,7 @@ def _make_alibi_bias(
 
 def _make_sliding_window_bias(
     seq_lens: list[int],
-    window_size: Optional[int],
+    window_size: int | None,
     dtype: torch.dtype,
 ) -> list[torch.Tensor]:
     attn_biases: list[torch.Tensor] = []
@@ -824,7 +824,7 @@ class _PagedAttention:
         kv_cache_dtype: str,
         num_kv_heads: int,
         scale: float,
-        alibi_slopes: Optional[torch.Tensor],
+        alibi_slopes: torch.Tensor | None,
         k_scale: torch.Tensor,
         v_scale: torch.Tensor,
         *args,
@@ -907,7 +907,7 @@ class _IPEXPagedAttention(_PagedAttention):
         kv_cache_dtype: str,
         num_kv_heads: int,
         scale: float,
-        alibi_slopes: Optional[torch.Tensor],
+        alibi_slopes: torch.Tensor | None,
         k_scale: torch.Tensor,
         v_scale: torch.Tensor,
         *args,
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index a71e51471905a..fb5ff499de2cd 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -3,7 +3,6 @@
 """Attention layer with FlashAttention."""
 
 from dataclasses import dataclass
-from typing import Optional, Union
 
 import numpy as np
 import torch
@@ -59,7 +58,7 @@ class FlashAttentionBackend(AttentionBackend):
         return [32, 64, 96, 128, 160, 192, 224, 256]
 
     @staticmethod
-    def get_supported_kernel_block_size() -> list[Union[int, MultipleOf]]:
+    def get_supported_kernel_block_size() -> list[int | MultipleOf]:
         return [MultipleOf(16)]
 
     @classmethod
@@ -144,13 +143,13 @@ class FlashAttentionMetadata:
     # For cascade attention.
     use_cascade: bool
     common_prefix_len: int
-    cu_prefix_query_lens: Optional[torch.Tensor]
-    prefix_kv_lens: Optional[torch.Tensor]
-    suffix_kv_lens: Optional[torch.Tensor]
+    cu_prefix_query_lens: torch.Tensor | None
+    prefix_kv_lens: torch.Tensor | None
+    suffix_kv_lens: torch.Tensor | None
 
     # Optional aot scheduling
-    scheduler_metadata: Optional[torch.Tensor] = None
-    prefix_scheduler_metadata: Optional[torch.Tensor] = None
+    scheduler_metadata: torch.Tensor | None = None
+    prefix_scheduler_metadata: torch.Tensor | None = None
     max_num_splits: int = 0
 
     causal: bool = True
@@ -158,9 +157,9 @@ class FlashAttentionMetadata:
 
 def _get_sliding_window_configs(
     vllm_config: VllmConfig,
-) -> set[Optional[tuple[int, int]]]:
+) -> set[tuple[int, int] | None]:
     """Get the set of all sliding window configs used in the model."""
-    sliding_window_configs: set[Optional[tuple[int, int]]] = set()
+    sliding_window_configs: set[tuple[int, int] | None] = set()
     layers = get_layers_from_vllm_config(vllm_config, Attention)
     for layer in layers.values():
         assert isinstance(layer.impl, FlashAttentionImpl)
@@ -242,7 +241,7 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad
 
         # Sliding window size to be used with the AOT scheduler will be
         # populated on first build() call.
-        self.aot_sliding_window: Optional[tuple[int, int]] = None
+        self.aot_sliding_window: tuple[int, int] | None = None
 
     def build(
         self,
@@ -403,13 +402,13 @@ class FlashAttentionImpl(AttentionImpl):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
+        logits_soft_cap: float | None = None,
         attn_type: AttentionType = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
-        sinks: Optional[torch.Tensor] = None,
+        kv_sharing_target_layer_name: str | None = None,
+        sinks: torch.Tensor | None = None,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -460,9 +459,9 @@ class FlashAttentionImpl(AttentionImpl):
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: FlashAttentionMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention.
 
@@ -762,18 +761,18 @@ def cascade_attention(
     suffix_kv_lens: torch.Tensor,
     max_kv_len: int,
     softmax_scale: float,
-    alibi_slopes: Optional[torch.Tensor],
+    alibi_slopes: torch.Tensor | None,
     sliding_window: tuple[int, int],
     logits_soft_cap: float,
     block_table: torch.Tensor,
     common_prefix_len: int,
     fa_version: int,
-    prefix_scheduler_metadata: Optional[torch.Tensor] = None,
-    suffix_scheduler_metadata: Optional[torch.Tensor] = None,
-    q_descale: Optional[torch.Tensor] = None,
-    k_descale: Optional[torch.Tensor] = None,
-    v_descale: Optional[torch.Tensor] = None,
-    s_aux: Optional[torch.Tensor] = None,
+    prefix_scheduler_metadata: torch.Tensor | None = None,
+    suffix_scheduler_metadata: torch.Tensor | None = None,
+    q_descale: torch.Tensor | None = None,
+    k_descale: torch.Tensor | None = None,
+    v_descale: torch.Tensor | None = None,
+    s_aux: torch.Tensor | None = None,
 ) -> torch.Tensor:
     assert alibi_slopes is None, "Cascade attention does not support ALiBi."
     # TODO: Support sliding window.
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 15fd48ca54aa1..f0bbf090060c1 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -2,10 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer with FlashInfer."""
 
-from __future__ import annotations
-
 from dataclasses import dataclass
-from typing import ClassVar, Union
+from typing import ClassVar
 
 import numpy as np
 import torch
@@ -167,7 +165,7 @@ class FlashInferBackend(AttentionBackend):
         return [64, 128, 256]
 
     @staticmethod
-    def get_supported_kernel_block_size() -> list[Union[int, MultipleOf]]:
+    def get_supported_kernel_block_size() -> list[int | MultipleOf]:
         # Note: Not sure for all platforms,
         # but on Blackwell, only support a page size of
         # 16, 32, 64
@@ -190,15 +188,15 @@ class FlashInferBackend(AttentionBackend):
         return "FLASHINFER"
 
     @staticmethod
-    def get_impl_cls() -> type[FlashInferImpl]:
+    def get_impl_cls() -> type["FlashInferImpl"]:
         return FlashInferImpl
 
     @staticmethod
-    def get_metadata_cls() -> type[FlashInferMetadata]:
+    def get_metadata_cls() -> type["FlashInferMetadata"]:
         return FlashInferMetadata
 
     @staticmethod
-    def get_builder_cls() -> type[FlashInferMetadataBuilder]:
+    def get_builder_cls() -> type["FlashInferMetadataBuilder"]:
         return FlashInferMetadataBuilder
 
     @staticmethod
@@ -1116,9 +1114,9 @@ def fast_plan_decode(
     pos_encoding_mode: str = "NONE",
     window_left: int = -1,
     logits_soft_cap: float | None = None,
-    q_data_type: Union[str, torch.dtype] | None = "float16",
-    kv_data_type: Union[str, torch.dtype] | None = None,
-    data_type: Union[str, torch.dtype] | None = None,
+    q_data_type: str | torch.dtype | None = "float16",
+    kv_data_type: str | torch.dtype | None = None,
+    data_type: str | torch.dtype | None = None,
     sm_scale: float | None = None,
     rope_scale: float | None = None,
     rope_theta: float | None = None,
diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py
index 7775445ae773e..2595851e5042d 100644
--- a/vllm/v1/attention/backends/flex_attention.py
+++ b/vllm/v1/attention/backends/flex_attention.py
@@ -3,7 +3,6 @@
 """Attention layer with FlexAttention."""
 
 from dataclasses import dataclass
-from typing import Optional, Union
 
 import torch
 import torch._dynamo.decorators
@@ -282,9 +281,9 @@ class FlexAttentionMetadata:
 
     use_cascade: bool
     common_prefix_len: int
-    cu_prefix_query_lens: Optional[torch.Tensor]
-    prefix_kv_lens: Optional[torch.Tensor]
-    suffix_kv_lens: Optional[torch.Tensor]
+    cu_prefix_query_lens: torch.Tensor | None
+    prefix_kv_lens: torch.Tensor | None
+    suffix_kv_lens: torch.Tensor | None
 
     # Block info
     total_cache_tokens: int
@@ -300,15 +299,15 @@ class FlexAttentionMetadata:
 
     # Flex Metadata
     num_blocks = 0
-    block_mask: Optional[BlockMask] = None
-    score_mod: Optional[_score_mod_signature] = None
+    block_mask: BlockMask | None = None
+    score_mod: _score_mod_signature | None = None
     logical_mask_mod: _mask_mod_signature = causal_mask_mod
-    doc_ids: Optional[torch.Tensor] = None
+    doc_ids: torch.Tensor | None = None
     direct_build: bool = True
     q_block_size: int = 16
     kv_block_size: int = 16
-    transformed_score_mod: Optional[_score_mod_signature] = None
-    sliding_window: Optional[int] = None
+    transformed_score_mod: _score_mod_signature | None = None
+    sliding_window: int | None = None
 
     def _convert_physical_to_logical(
         self,
@@ -443,7 +442,7 @@ class FlexAttentionMetadata:
             mask_mod = and_masks(mask_mod, sliding_window_mask_mod)
         return mask_mod
 
-    def get_transformed_score_mod(self) -> Optional[_score_mod_signature]:
+    def get_transformed_score_mod(self) -> _score_mod_signature | None:
         """Creates the transformed score_mod function for FlexAttention.
 
         This function wraps the user's score_mod to handle physical-to-logical
@@ -669,9 +668,9 @@ class FlexAttentionMetadataBuilder(AttentionMetadataBuilder[FlexAttentionMetadat
 
 
 class FlexAttentionImpl(AttentionImpl):
-    sliding_window: Optional[int]
-    alibi_slopes: Optional[torch.Tensor]
-    logits_soft_cap: Optional[float]
+    sliding_window: int | None
+    alibi_slopes: torch.Tensor | None
+    logits_soft_cap: float | None
 
     def __init__(
         self,
@@ -679,12 +678,12 @@ class FlexAttentionImpl(AttentionImpl):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
+        logits_soft_cap: float | None = None,
         attn_type: AttentionType = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
+        kv_sharing_target_layer_name: str | None = None,
         **kwargs,
     ) -> None:
         self.num_heads = num_heads
@@ -742,9 +741,9 @@ class FlexAttentionImpl(AttentionImpl):
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: FlexAttentionMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Forward pass with FLexAttention.
 
@@ -860,8 +859,8 @@ class FlexAttentionImpl(AttentionImpl):
 
 def get_kernel_options(
     query, block_m, block_n, use_direct_build: bool
-) -> dict[str, Union[int, bool]]:
-    kernel_options: dict[str, Union[int, bool]] = {
+) -> dict[str, int | bool]:
+    kernel_options: dict[str, int | bool] = {
         "FORCE_USE_FLEX_ATTENTION": True,
     }
     if vllm_kernel_override_batch_invariant():
diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py
index 21fc2ab72768c..1deda1ccd78a4 100644
--- a/vllm/v1/attention/backends/gdn_attn.py
+++ b/vllm/v1/attention/backends/gdn_attn.py
@@ -3,7 +3,6 @@
 """Backend for GatedDeltaNet attention."""
 
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 
@@ -36,29 +35,27 @@ class GDNAttentionMetadata:
     num_spec_decode_tokens: int
     num_actual_tokens: int
 
-    has_initial_state: Optional[torch.Tensor] = None
+    has_initial_state: torch.Tensor | None = None
 
-    spec_query_start_loc: Optional[torch.Tensor] = (
-        None  # shape: [num_spec_decodes + 1,]
-    )
-    non_spec_query_start_loc: Optional[torch.Tensor] = (
+    spec_query_start_loc: torch.Tensor | None = None  # shape: [num_spec_decodes + 1,]
+    non_spec_query_start_loc: torch.Tensor | None = (
         None  # shape: [batch - num_spec_decodes + 1,]
     )
 
-    spec_state_indices_tensor: Optional[torch.Tensor] = None  # shape: [batch, num_spec]
-    non_spec_state_indices_tensor: Optional[torch.Tensor] = (
+    spec_state_indices_tensor: torch.Tensor | None = None  # shape: [batch, num_spec]
+    non_spec_state_indices_tensor: torch.Tensor | None = (
         None  # shape: [batch - num_spec_decodes,]
     )
-    spec_sequence_masks: Optional[torch.Tensor] = None  # shape: [batch,]
-    spec_token_masks: Optional[torch.Tensor] = (
+    spec_sequence_masks: torch.Tensor | None = None  # shape: [batch,]
+    spec_token_masks: torch.Tensor | None = (
         None  # shape: [num_prefill_tokens + num_decode_tokens,]
     )
-    num_accepted_tokens: Optional[torch.Tensor] = None  # shape: [batch,]
+    num_accepted_tokens: torch.Tensor | None = None  # shape: [batch,]
 
     # The following attributes are for triton implementation of causal_conv1d
-    nums_dict: Optional[dict] = None
-    batch_ptr: Optional[torch.Tensor] = None
-    token_chunk_offset_ptr: Optional[torch.Tensor] = None
+    nums_dict: dict | None = None
+    batch_ptr: torch.Tensor | None = None
+    token_chunk_offset_ptr: torch.Tensor | None = None
 
 
 class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata]):
@@ -133,8 +130,8 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata]
         self,
         common_prefix_len: int,
         common_attn_metadata: CommonAttentionMetadata,
-        num_accepted_tokens: Optional[torch.Tensor] = None,
-        num_decode_draft_tokens_cpu: Optional[torch.Tensor] = None,
+        num_accepted_tokens: torch.Tensor | None = None,
+        num_decode_draft_tokens_cpu: torch.Tensor | None = None,
         fast_build: bool = False,
     ) -> GDNAttentionMetadata:
         m = common_attn_metadata
diff --git a/vllm/v1/attention/backends/mamba1_attn.py b/vllm/v1/attention/backends/mamba1_attn.py
index e305cb2d87029..30c63e0ded8e7 100644
--- a/vllm/v1/attention/backends/mamba1_attn.py
+++ b/vllm/v1/attention/backends/mamba1_attn.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 
@@ -26,7 +25,7 @@ class Mamba1AttentionMetadata:
     query_start_loc: torch.Tensor
     context_lens_tensor: torch.Tensor
     state_indices_tensor: torch.Tensor
-    has_initial_states: Optional[torch.Tensor]
+    has_initial_states: torch.Tensor | None
     num_prefills: int
     num_prefill_tokens: int
     num_decodes: int
diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py
index 10f09442d82e2..7ca8501a8a6fb 100644
--- a/vllm/v1/attention/backends/mamba2_attn.py
+++ b/vllm/v1/attention/backends/mamba2_attn.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 
@@ -108,18 +107,18 @@ class Mamba2AttentionMetadata:
 
     # The following tensors only contain prefill requests and will be None if
     # the batch has no prefill request.
-    has_initial_states_p: Optional[torch.Tensor]
-    seq_idx_p: Optional[torch.Tensor]
+    has_initial_states_p: torch.Tensor | None
+    seq_idx_p: torch.Tensor | None
 
     # cu_chunk_seqlen_p is a tensor of shape (nchunks+1,) that contains, for
     # each chunk, its offests into the varlen sequence dimension. It is defined
     # such that the i-th chunk contains tokens from cu_chunk_seqlen_p[i] to
     # cu_chunk_seqlen_p[i+1].
-    cu_chunk_seqlen_p: Optional[torch.Tensor]
+    cu_chunk_seqlen_p: torch.Tensor | None
 
     # last_chunk_indices_p is a tensor of shape (batch,) that contains the
     # index of the last chunk for every sequence in the (prefill) batch.
-    last_chunk_indices_p: Optional[torch.Tensor]
+    last_chunk_indices_p: torch.Tensor | None
 
     state_indices_tensor: torch.Tensor  # shape: [batch,]
     block_idx_last_scheduled_token: torch.Tensor  # shape: [batch,]
@@ -128,9 +127,9 @@ class Mamba2AttentionMetadata:
     num_computed_tokens_p: torch.Tensor  # shape: [batch,]
 
     # The following attributes are for triton implementation of causal_conv1d
-    nums_dict: Optional[dict] = None
-    batch_ptr: Optional[torch.Tensor] = None
-    token_chunk_offset_ptr: Optional[torch.Tensor] = None
+    nums_dict: dict | None = None
+    batch_ptr: torch.Tensor | None = None
+    token_chunk_offset_ptr: torch.Tensor | None = None
 
 
 class Mamba2AttentionMetadataBuilder(
diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
index af396c2b41035..da56b5c9d3d22 100755
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -190,7 +190,7 @@ return curr_o @ W_O
 import functools
 from abc import abstractmethod
 from dataclasses import dataclass, field
-from typing import ClassVar, Generic, Optional, TypeVar, Union
+from typing import ClassVar, Generic, TypeVar
 
 import torch
 from tqdm import tqdm
@@ -243,6 +243,8 @@ try:
 
     flashinfer_available = True
 except ImportError:
+    BatchPrefillWithRaggedKVCacheWrapper = object
+
     flashinfer_available = False
 
 
@@ -337,22 +339,22 @@ class MLACommonPrefillMetadata:
         workspace: torch.Tensor
 
         # for mla DCP
-        cp_chunk_seq_lens: Optional[list[list[int]]] = None
-        origin_context_lens: Optional[list[int]] = None
-        cp_cu_seq_lens: Optional[torch.Tensor] = None
-        chunk_size: Optional[int] = None
-        cu_seq_lens_lst: Optional[list[list[int]]] = None
+        cp_chunk_seq_lens: list[list[int]] | None = None
+        origin_context_lens: list[int] | None = None
+        cp_cu_seq_lens: torch.Tensor | None = None
+        chunk_size: int | None = None
+        cu_seq_lens_lst: list[list[int]] | None = None
 
     block_table: torch.Tensor
     query_start_loc: torch.Tensor
     max_query_len: int
-    chunked_context: Optional[ChunkedContextMetadata] = None
+    chunked_context: ChunkedContextMetadata | None = None
 
 
 @dataclass
 class FlashInferPrefillMetadata(MLACommonPrefillMetadata):
-    prefill_main: Optional["BatchPrefillWithRaggedKVCacheWrapper"] = None
-    prefill_chunks: list["BatchPrefillWithRaggedKVCacheWrapper"] = field(
+    prefill_main: BatchPrefillWithRaggedKVCacheWrapper | None = None
+    prefill_chunks: list[BatchPrefillWithRaggedKVCacheWrapper] = field(
         default_factory=list
     )
 
@@ -362,15 +364,15 @@ class CudnnPrefillMetadata(MLACommonPrefillMetadata):
     class ChunkedContextMetadata(MLACommonPrefillMetadata.ChunkedContextMetadata):
         seq_lens: torch.Tensor
 
-    query_seq_lens: Optional[torch.Tensor] = None
-    cudnn_workspace: Optional[torch.Tensor] = None
+    query_seq_lens: torch.Tensor | None = None
+    cudnn_workspace: torch.Tensor | None = None
 
 
 @dataclass
 class MLACommonDecodeMetadata:
     block_table: torch.Tensor
     seq_lens: torch.Tensor
-    dcp_tot_seq_lens: Optional[torch.Tensor]
+    dcp_tot_seq_lens: torch.Tensor | None
 
 
 D = TypeVar("D", bound=MLACommonDecodeMetadata)
@@ -407,12 +409,15 @@ class MLACommonMetadata(Generic[D]):
     num_prefills: int
 
     # The dimension of the attention heads
-    head_dim: Optional[int] = None
+    head_dim: int | None = None
 
-    decode: Optional[D] = None
-    prefill: Optional[
-        Union[MLACommonPrefillMetadata, FlashInferPrefillMetadata, CudnnPrefillMetadata]
-    ] = None
+    decode: D | None = None
+    prefill: (
+        MLACommonPrefillMetadata
+        | FlashInferPrefillMetadata
+        | CudnnPrefillMetadata
+        | None
+    ) = None
 
     def __post_init__(self):
         if self.head_dim is not None:
@@ -508,7 +513,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
         layer_names: list[str],
         vllm_config: VllmConfig,
         device: torch.device,
-        metadata_cls: Optional[type[M]] = None,
+        metadata_cls: type[M] | None = None,
     ):
         self.metadata_cls = (
             metadata_cls if metadata_cls is not None else MLACommonMetadata
@@ -580,7 +585,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
                 FLASHINFER_WORKSPACE_BUFFER_SIZE, dtype=torch.uint8, device=device
             )
 
-            self._fi_prefill_main: Optional[BatchPrefillWithRaggedKVCacheWrapper] = None
+            self._fi_prefill_main: BatchPrefillWithRaggedKVCacheWrapper | None = None
             self._fi_prefill_chunks: list[BatchPrefillWithRaggedKVCacheWrapper] = []
 
             self._global_hyperparameters = infer_global_hyperparameters(
@@ -683,7 +688,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
         query_start_loc_cpu: torch.Tensor,
         query_start_loc_device: torch.Tensor,
         num_decode_tokens: int,
-        dcp_tot_seq_lens_device: Optional[torch.Tensor],
+        dcp_tot_seq_lens_device: torch.Tensor | None,
     ) -> MLACommonDecodeMetadata:
         return MLACommonDecodeMetadata(
             block_table=block_table_tensor,
@@ -1023,14 +1028,14 @@ class MLACommonBaseImpl(MLAAttentionImpl[A], Generic[A]):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float],
+        logits_soft_cap: float | None,
         attn_type: str,
-        kv_sharing_target_layer_name: Optional[str],
+        kv_sharing_target_layer_name: str | None,
         # MLA Specific Arguments
-        q_lora_rank: Optional[int],
+        q_lora_rank: int | None,
         kv_lora_rank: int,
         qk_nope_head_dim: int,
         qk_rope_head_dim: int,
@@ -1038,7 +1043,7 @@ class MLACommonBaseImpl(MLAAttentionImpl[A], Generic[A]):
         v_head_dim: int,
         kv_b_proj: ColumnParallelLinear,
         indexer=None,
-        q_pad_num_heads: Optional[int] = None,
+        q_pad_num_heads: int | None = None,
     ) -> None:
         if kv_sharing_target_layer_name is not None:
             raise NotImplementedError("KV sharing is not supported for MLA")
@@ -1226,7 +1231,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
                 and current_platform.get_device_capability()[0] == 9
             )
 
-        self.dcp_world_size: Optional[int] = None
+        self.dcp_world_size: int | None = None
 
         self.chunked_prefill_workspace_size = (
             MLACommonMetadataBuilder.determine_chunked_prefill_workspace_size(
@@ -1710,11 +1715,11 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
     @abstractmethod
     def _forward_decode(
         self,
-        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
         kv_c_and_k_pe_cache: torch.Tensor,
         attn_metadata: M,
         layer: AttentionLayer,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         raise NotImplementedError
 
     def forward(
@@ -1725,9 +1730,9 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
         k_pe: torch.Tensor,  # value in unified attn
         kv_cache: torch.Tensor,
         attn_metadata: M,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         assert output is not None, "Output tensor must be provided."
 
diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py
index 11e06cc6daac7..bd52de07d2739 100644
--- a/vllm/v1/attention/backends/mla/cutlass_mla.py
+++ b/vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
-from typing import ClassVar, Optional, Union
+from typing import ClassVar
 
 import torch
 
@@ -46,7 +46,7 @@ class CutlassMLABackend(MLACommonBackend):
         return CutlassMLAMetadataBuilder
 
     @staticmethod
-    def get_supported_kernel_block_size() -> list[Union[int, MultipleOf]]:
+    def get_supported_kernel_block_size() -> list[int | MultipleOf]:
         return [128]
 
 
@@ -95,12 +95,12 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float],
+        logits_soft_cap: float | None,
         attn_type: str,
-        kv_sharing_target_layer_name: Optional[str],
+        kv_sharing_target_layer_name: str | None,
         # MLA Specific Arguments
         **mla_args,
     ) -> None:
@@ -232,11 +232,11 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
 
     def _forward_decode(
         self,
-        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
         kv_c_and_k_pe_cache: torch.Tensor,
         attn_metadata: MLACommonMetadata,
         layer: AttentionLayer,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         assert kv_c_and_k_pe_cache.numel() > 0
         assert attn_metadata.decode is not None
 
diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py
index c043990ffcc61..6e1586969fd4a 100644
--- a/vllm/v1/attention/backends/mla/flashattn_mla.py
+++ b/vllm/v1/attention/backends/mla/flashattn_mla.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import ClassVar, Optional, Union
+from typing import ClassVar
 
 import torch
 
@@ -55,7 +55,7 @@ class FlashAttnMLADecodeMetadata(MLACommonDecodeMetadata):
     query_start_loc: torch.Tensor
     max_query_len: int
     max_seq_len: int
-    scheduler_metadata: Optional[torch.Tensor] = None
+    scheduler_metadata: torch.Tensor | None = None
     max_num_splits: int = 0
 
 
@@ -135,7 +135,7 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata]
         query_start_loc_cpu: torch.Tensor,
         query_start_loc_device: torch.Tensor,
         num_decode_tokens: int,
-        dcp_tot_seq_lens_device: Optional[torch.Tensor],
+        dcp_tot_seq_lens_device: torch.Tensor | None,
     ) -> FlashAttnMLADecodeMetadata:
         query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
         max_query_len = query_lens_cpu.max().item()
@@ -195,12 +195,12 @@ class FlashAttnMLAImpl(MLACommonImpl[FlashAttnMLAMetadata]):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float],
+        logits_soft_cap: float | None,
         attn_type: str,
-        kv_sharing_target_layer_name: Optional[str],
+        kv_sharing_target_layer_name: str | None,
         # MLA Specific Arguments
         **mla_args,
     ) -> None:
@@ -242,11 +242,11 @@ class FlashAttnMLAImpl(MLACommonImpl[FlashAttnMLAMetadata]):
 
     def _forward_decode(
         self,
-        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
         kv_c_and_k_pe_cache: torch.Tensor,
         attn_metadata: FlashAttnMLAMetadata,
         layer: AttentionLayer,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         assert kv_c_and_k_pe_cache.numel() > 0
         assert attn_metadata.decode is not None
 
diff --git a/vllm/v1/attention/backends/mla/flashinfer_mla.py b/vllm/v1/attention/backends/mla/flashinfer_mla.py
index 206f96ea366a4..add1c8dc972f5 100644
--- a/vllm/v1/attention/backends/mla/flashinfer_mla.py
+++ b/vllm/v1/attention/backends/mla/flashinfer_mla.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import ClassVar, Optional, Union
+from typing import ClassVar
 
 import torch
 from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla
@@ -57,12 +57,12 @@ class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float],
+        logits_soft_cap: float | None,
         attn_type: str,
-        kv_sharing_target_layer_name: Optional[str],
+        kv_sharing_target_layer_name: str | None,
         # MLA Specific Arguments
         **mla_args,
     ) -> None:
@@ -96,16 +96,16 @@ class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]):
             )
 
         self._workspace_buffer = g_fi_workspace
-        self.bmm1_scale: Optional[float] = None
-        self.bmm2_scale: Optional[float] = None
+        self.bmm1_scale: float | None = None
+        self.bmm2_scale: float | None = None
 
     def _forward_decode(
         self,
-        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
         kv_c_and_k_pe_cache: torch.Tensor,
         attn_metadata: MLACommonMetadata,
         layer: AttentionLayer,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         assert kv_c_and_k_pe_cache.numel() > 0
         assert attn_metadata.decode is not None
 
diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py
index e0f4a7f0382b3..d8d1ab2c6cc0c 100644
--- a/vllm/v1/attention/backends/mla/flashmla.py
+++ b/vllm/v1/attention/backends/mla/flashmla.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import ClassVar, Optional, Union
+from typing import ClassVar
 
 import torch
 
@@ -45,7 +45,7 @@ class FlashMLABackend(MLACommonBackend):
         return FlashMLAImpl
 
     @staticmethod
-    def get_supported_kernel_block_size() -> list[Union[int, MultipleOf]]:
+    def get_supported_kernel_block_size() -> list[int | MultipleOf]:
         return [64]
 
 
@@ -106,7 +106,7 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]):
         query_start_loc_cpu: torch.Tensor,
         query_start_loc_device: torch.Tensor,
         num_decode_tokens: int,
-        dcp_tot_seq_lens_device: Optional[torch.Tensor],
+        dcp_tot_seq_lens_device: torch.Tensor | None,
     ) -> FlashMLADecodeMetadata:
         tile_scheduler_metadata, num_splits = get_mla_metadata(
             seq_lens_device,
@@ -160,12 +160,12 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float],
+        logits_soft_cap: float | None,
         attn_type: str,
-        kv_sharing_target_layer_name: Optional[str],
+        kv_sharing_target_layer_name: str | None,
         # MLA Specific Arguments
         **mla_args,
     ) -> None:
@@ -203,11 +203,11 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
 
     def _forward_decode(
         self,
-        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
         kv_c_and_k_pe_cache: torch.Tensor,
         attn_metadata: FlashMLAMetadata,
         layer: AttentionLayer,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         # TODO: (zyongye) decode function for mla here
         assert kv_c_and_k_pe_cache.numel() > 0
         assert attn_metadata.decode is not None
diff --git a/vllm/v1/attention/backends/mla/flashmla_sparse.py b/vllm/v1/attention/backends/mla/flashmla_sparse.py
index 144e46d5e9537..141436e66c32c 100644
--- a/vllm/v1/attention/backends/mla/flashmla_sparse.py
+++ b/vllm/v1/attention/backends/mla/flashmla_sparse.py
@@ -110,12 +110,12 @@ class FlashMLASparseMetadata:
 
     @dataclass
     class FP8KernelMetadata:
-        scheduler_metadata: Optional[torch.Tensor]
+        scheduler_metadata: torch.Tensor | None
         num_splits: torch.Tensor
         dummy_block_table: torch.Tensor
         cache_lens: torch.Tensor
 
-    fp8_extra_metadata: Optional[FP8KernelMetadata] = None
+    fp8_extra_metadata: FP8KernelMetadata | None = None
 
 
 @triton.jit
@@ -373,14 +373,14 @@ class FlashMLASparseImpl(MLACommonBaseImpl[FlashMLASparseMetadata]):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float],
+        logits_soft_cap: float | None,
         attn_type: str,
-        kv_sharing_target_layer_name: Optional[str],
+        kv_sharing_target_layer_name: str | None,
         # MLA Specific Arguments
-        topk_indice_buffer: Optional[torch.Tensor] = None,
+        topk_indice_buffer: torch.Tensor | None = None,
         indexer: Optional["Indexer"] = None,
         **mla_args,
     ) -> None:
@@ -466,9 +466,9 @@ class FlashMLASparseImpl(MLACommonBaseImpl[FlashMLASparseMetadata]):
         k_pe: torch.Tensor,  # value in unified attn
         kv_cache: torch.Tensor,
         attn_metadata: FlashMLASparseMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         # NOTE(lucas): for the sparse FlashMLA kernels the kernels want to use
         # MQA 576/512 approach for both prefill and decode
diff --git a/vllm/v1/attention/backends/mla/indexer.py b/vllm/v1/attention/backends/mla/indexer.py
index b8a232c8447bb..49009a939d0b5 100644
--- a/vllm/v1/attention/backends/mla/indexer.py
+++ b/vllm/v1/attention/backends/mla/indexer.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import ClassVar, Optional, Union
+from typing import ClassVar
 
 import torch
 
@@ -52,7 +52,7 @@ class DeepseekV32IndexerBackend(AttentionBackend):
         return (0, 1, 2)
 
     @classmethod
-    def get_supported_kernel_block_size(cls) -> list[Union[int, MultipleOf]]:
+    def get_supported_kernel_block_size(cls) -> list[int | MultipleOf]:
         return [64]
 
 
@@ -105,8 +105,8 @@ class DeepseekV32IndexerMetadata:
     num_prefills: int
     num_prefill_tokens: int
 
-    decode: Optional[DeepSeekV32IndexerDecodeMetadata] = None
-    prefill: Optional[DeepseekV32IndexerPrefillMetadata] = None
+    decode: DeepSeekV32IndexerDecodeMetadata | None = None
+    prefill: DeepseekV32IndexerPrefillMetadata | None = None
 
 
 # TODO (zyongye) optimize this, this is now vibe coded
diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
index 195b05e0a301f..d935c02243bd9 100644
--- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
+++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import ClassVar, Optional, Union
+from typing import ClassVar
 
 import torch
 
@@ -47,14 +47,14 @@ class AiterMLABackend(MLACommonBackend):
 @dataclass
 class AiterMLADecodeMetadata(MLACommonDecodeMetadata):
     # The indptr of the paged kv cache, shape: [batch_size + 1]
-    paged_kv_indptr: Optional[torch.Tensor] = None
+    paged_kv_indptr: torch.Tensor | None = None
     # The page indices of the paged kv cache
-    paged_kv_indices: Optional[torch.Tensor] = None
+    paged_kv_indices: torch.Tensor | None = None
     # The number of entries in the last page of each request in
     # the paged kv cache, shape: [batch_size]
-    paged_kv_last_page_len: Optional[torch.Tensor] = None
+    paged_kv_last_page_len: torch.Tensor | None = None
     # The query indptr, shape : [num_decode + 1]
-    qo_indptr: Optional[torch.Tensor] = None
+    qo_indptr: torch.Tensor | None = None
 
 
 class AiterMLAMetadata(MLACommonMetadata[AiterMLADecodeMetadata]):
@@ -116,7 +116,7 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
         query_start_loc_cpu: torch.Tensor,
         query_start_loc_device: torch.Tensor,
         num_decode_tokens: int,
-        dcp_tot_seq_lens_device: Optional[torch.Tensor],
+        dcp_tot_seq_lens_device: torch.Tensor | None,
     ) -> AiterMLADecodeMetadata:
         page_size = self.kv_cache_spec.block_size
         block_table_bounds = (seq_lens_device + page_size - 1) // page_size
@@ -188,12 +188,12 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float],
+        logits_soft_cap: float | None,
         attn_type: str,
-        kv_sharing_target_layer_name: Optional[str],
+        kv_sharing_target_layer_name: str | None,
         # MLA Specific Arguments
         **mla_args,
     ) -> None:
@@ -242,11 +242,11 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]):
 
     def _forward_decode(
         self,
-        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
         kv_c_and_k_pe_cache: torch.Tensor,
         attn_metadata: AiterMLAMetadata,
         layer: AttentionLayer,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         assert kv_c_and_k_pe_cache.numel() > 0
         assert attn_metadata.decode is not None
 
diff --git a/vllm/v1/attention/backends/mla/triton_mla.py b/vllm/v1/attention/backends/mla/triton_mla.py
index 3b6718c48d09a..e2df0179d99a8 100644
--- a/vllm/v1/attention/backends/mla/triton_mla.py
+++ b/vllm/v1/attention/backends/mla/triton_mla.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional, Union
 
 import torch
 
@@ -44,12 +43,12 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float],
+        logits_soft_cap: float | None,
         attn_type: str,
-        kv_sharing_target_layer_name: Optional[str],
+        kv_sharing_target_layer_name: str | None,
         # MLA Specific Arguments
         **mla_args,
     ) -> None:
@@ -138,11 +137,11 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]):
 
     def _forward_decode(
         self,
-        q: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
         kv_c_and_k_pe_cache: torch.Tensor,
         attn_metadata: MLACommonMetadata,
         layer: AttentionLayer,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         assert kv_c_and_k_pe_cache.numel() > 0
         assert attn_metadata.decode is not None
 
diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py
index 1622f852a9522..28085cb1424b4 100644
--- a/vllm/v1/attention/backends/pallas.py
+++ b/vllm/v1/attention/backends/pallas.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 
@@ -201,12 +200,12 @@ class PallasAttentionBackendImpl(AttentionImpl):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
+        logits_soft_cap: float | None = None,
         attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[int] = None,
+        kv_sharing_target_layer_name: int | None = None,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -242,9 +241,9 @@ class PallasAttentionBackendImpl(AttentionImpl):
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: PallasMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Forward pass with Pallas attention.
 
@@ -342,7 +341,7 @@ def write_to_kv_cache(
     slot_mapping: torch.Tensor,
     num_slices_per_kv_cache_update_block: int,
     num_kv_update_slices: torch.Tensor,
-    kv_cache_quantized_dtype: Optional[torch.dtype] = None,
+    kv_cache_quantized_dtype: torch.dtype | None = None,
     k_scale: float = 1.0,
     v_scale: float = 1.0,
 ) -> None:
diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py
index 82505f6281c0a..cce43b220da77 100644
--- a/vllm/v1/attention/backends/rocm_aiter_fa.py
+++ b/vllm/v1/attention/backends/rocm_aiter_fa.py
@@ -3,7 +3,6 @@
 """Attention layer with AiterFlashAttention."""
 
 from dataclasses import dataclass
-from typing import Optional, Union
 
 import torch
 
@@ -160,8 +159,8 @@ if current_platform.is_rocm():
         max_seqlen_q: int,
         max_seqlen_k: int,
         softmax_scale: float,
-        window_size: Optional[list[int]],  # -1 means infinite context window
-        alibi_slopes: Optional[list[float]],
+        window_size: list[int] | None,  # -1 means infinite context window
+        alibi_slopes: list[float] | None,
         block_table: torch.Tensor,
         k_scale: torch.Tensor,
         v_scale: torch.Tensor,
@@ -209,8 +208,8 @@ if current_platform.is_rocm():
         max_seqlen_q: int,
         max_seqlen_k: int,
         softmax_scale: float,
-        window_size: Optional[list[int]],  # -1 means infinite context window
-        alibi_slopes: Optional[list[float]],
+        window_size: list[int] | None,  # -1 means infinite context window
+        alibi_slopes: list[float] | None,
         block_table: torch.Tensor,
         k_scale: torch.Tensor,
         v_scale: torch.Tensor,
@@ -249,7 +248,7 @@ class AiterFlashAttentionMetadata:
     seq_lens: torch.Tensor
     slot_mapping: torch.Tensor
     block_table: torch.Tensor
-    cu_seq_lens: Optional[torch.Tensor]
+    cu_seq_lens: torch.Tensor | None
 
     # For cascade attention.
     use_cascade: bool
@@ -283,7 +282,7 @@ class AiterFlashAttentionMetadataBuilder(
         self.block_size = kv_cache_spec.block_size
         # Sliding window size to be used with the AOT scheduler will be
         # populated on first build() call.
-        self.aot_sliding_window: Optional[tuple[int, int]] = None
+        self.aot_sliding_window: tuple[int, int] | None = None
         self.total_tokens: int = 0
 
     def build_for_cudagraph_capture(
@@ -361,7 +360,7 @@ class AiterFlashAttentionBackend(AttentionBackend):
         return [64, 128, 256]
 
     @staticmethod
-    def get_supported_kernel_block_size() -> list[Union[int, MultipleOf]]:
+    def get_supported_kernel_block_size() -> list[int | MultipleOf]:
         return [MultipleOf(16)]
 
     @classmethod
@@ -412,12 +411,12 @@ class AiterFlashAttentionImpl(AttentionImpl):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
+        logits_soft_cap: float | None = None,
         attn_type: AttentionType = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[int] = None,
+        kv_sharing_target_layer_name: int | None = None,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -458,9 +457,9 @@ class AiterFlashAttentionImpl(AttentionImpl):
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: AiterFlashAttentionMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Forward pass with AiterFlashAttention.
 
diff --git a/vllm/v1/attention/backends/rocm_aiter_unified_attn.py b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
index 235ea1c376ef4..14184944934fa 100644
--- a/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
+++ b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
@@ -2,8 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention layer with PagedAttention and Triton prefix prefill."""
 
-from typing import Optional
-
 import torch
 
 from vllm import _custom_ops as ops
@@ -70,13 +68,13 @@ class RocmAiterUnifiedAttentionImpl(RocmAttentionImpl):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
+        logits_soft_cap: float | None = None,
         attn_type: AttentionType = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[int] = None,
-        sinks: Optional[torch.Tensor] = None,
+        kv_sharing_target_layer_name: int | None = None,
+        sinks: torch.Tensor | None = None,
     ) -> None:
         super().__init__(
             num_heads,
@@ -106,9 +104,9 @@ class RocmAiterUnifiedAttentionImpl(RocmAttentionImpl):
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: FlashAttentionMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention.
 
diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py
index 10dd01f0a5aa4..5245c7f449259 100644
--- a/vllm/v1/attention/backends/rocm_attn.py
+++ b/vllm/v1/attention/backends/rocm_attn.py
@@ -3,7 +3,7 @@
 """Attention layer with PagedAttention and Triton prefix prefill."""
 
 from dataclasses import dataclass
-from typing import ClassVar, Optional
+from typing import ClassVar
 
 import torch
 
@@ -54,13 +54,13 @@ class RocmAttentionMetadata:
     # For cascade attention.
     use_cascade: bool
     common_prefix_len: int
-    cu_prefix_query_lens: Optional[torch.Tensor]
-    prefix_kv_lens: Optional[torch.Tensor]
-    suffix_kv_lens: Optional[torch.Tensor]
+    cu_prefix_query_lens: torch.Tensor | None
+    prefix_kv_lens: torch.Tensor | None
+    suffix_kv_lens: torch.Tensor | None
 
     # Optional aot scheduling
-    scheduler_metadata: Optional[torch.Tensor] = None
-    prefix_scheduler_metadata: Optional[torch.Tensor] = None
+    scheduler_metadata: torch.Tensor | None = None
+    prefix_scheduler_metadata: torch.Tensor | None = None
 
 
 class RocmAttentionMetadataBuilder(AttentionMetadataBuilder[RocmAttentionMetadata]):
@@ -217,13 +217,13 @@ class RocmAttentionImpl(AttentionImpl):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
+        logits_soft_cap: float | None = None,
         attn_type: AttentionType = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[int] = None,
-        sinks: Optional[torch.Tensor] = None,
+        kv_sharing_target_layer_name: int | None = None,
+        sinks: torch.Tensor | None = None,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -273,9 +273,9 @@ class RocmAttentionImpl(AttentionImpl):
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: FlashAttentionMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Forward pass with FlashAttention.
 
diff --git a/vllm/v1/attention/backends/short_conv_attn.py b/vllm/v1/attention/backends/short_conv_attn.py
index 74cfecca764e6..22ad1054b35e1 100644
--- a/vllm/v1/attention/backends/short_conv_attn.py
+++ b/vllm/v1/attention/backends/short_conv_attn.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 
@@ -30,12 +29,12 @@ class ShortConvAttentionMetadata:
 
     query_start_loc: torch.Tensor
     state_indices_tensor: torch.Tensor
-    has_initial_states_p: Optional[torch.Tensor]
+    has_initial_states_p: torch.Tensor | None
 
     # For causal_conv1d
-    nums_dict: Optional[dict] = None
-    batch_ptr: Optional[torch.Tensor] = None
-    token_chunk_offset_ptr: Optional[torch.Tensor] = None
+    nums_dict: dict | None = None
+    batch_ptr: torch.Tensor | None = None
+    token_chunk_offset_ptr: torch.Tensor | None = None
 
 
 class ShortConvAttentionMetadataBuilder(
diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py
index 669dbe31810b6..aab90cfd1fe0d 100644
--- a/vllm/v1/attention/backends/tree_attn.py
+++ b/vllm/v1/attention/backends/tree_attn.py
@@ -4,7 +4,7 @@
 
 import ast
 from dataclasses import dataclass
-from typing import Optional, Union
+from typing import Optional
 
 import torch
 
@@ -41,7 +41,7 @@ class TreeAttentionBackend(AttentionBackend):
         return [32, 64, 96, 128, 160, 192, 224, 256]
 
     @staticmethod
-    def get_supported_kernel_block_size() -> list[Union[int, MultipleOf]]:
+    def get_supported_kernel_block_size() -> list[int | MultipleOf]:
         return [MultipleOf(16)]
 
     @classmethod
@@ -104,7 +104,7 @@ class TreeAttentionMetadata:
     num_prefills: int = 0
     num_decodes: int = 0
 
-    tree_attn_bias: Optional[torch.Tensor] = None
+    tree_attn_bias: torch.Tensor | None = None
 
     # Cached Prefill/decode metadata.
     _cached_prefill_metadata: Optional["TreeAttentionMetadata"] = None
@@ -267,8 +267,8 @@ def _get_depth_counts(sorted_tree_choices: list[tuple[int, ...]]) -> list[int]:
 def _prepare_tree_attn_bias(
     sorted_tree_choices: list[tuple[int, ...]],
     depth_counts: list[int],
-    dtype: Optional[torch.dtype],
-    device: Optional[torch.device],
+    dtype: torch.dtype | None,
+    device: torch.device | None,
 ) -> torch.Tensor:
     # +1 comes from the additional root node.
     tree_len = len(sorted_tree_choices) + 1
@@ -310,12 +310,12 @@ class TreeAttentionImpl(AttentionImpl):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
+        logits_soft_cap: float | None = None,
         attn_type: AttentionType = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
+        kv_sharing_target_layer_name: str | None = None,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -354,9 +354,9 @@ class TreeAttentionImpl(AttentionImpl):
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: TreeAttentionMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Forward pass with TreeAttention.
 
diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py
index 878634c7f521d..9d1d007a08e4c 100644
--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -3,7 +3,7 @@
 """High-Performance Triton-only Attention layer."""
 
 from dataclasses import dataclass
-from typing import ClassVar, Optional, Union
+from typing import ClassVar
 
 import torch
 
@@ -61,13 +61,13 @@ class TritonAttentionMetadata:
     # For cascade attention.
     use_cascade: bool
     common_prefix_len: int
-    cu_prefix_query_lens: Optional[torch.Tensor]
-    prefix_kv_lens: Optional[torch.Tensor]
-    suffix_kv_lens: Optional[torch.Tensor]
+    cu_prefix_query_lens: torch.Tensor | None
+    prefix_kv_lens: torch.Tensor | None
+    suffix_kv_lens: torch.Tensor | None
 
     # Optional aot scheduling
-    scheduler_metadata: Optional[torch.Tensor] = None
-    prefix_scheduler_metadata: Optional[torch.Tensor] = None
+    scheduler_metadata: torch.Tensor | None = None
+    prefix_scheduler_metadata: torch.Tensor | None = None
 
 
 class TritonAttentionMetadataBuilder(AttentionMetadataBuilder[TritonAttentionMetadata]):
@@ -159,7 +159,7 @@ class TritonAttentionBackend(AttentionBackend):
         return [torch.float16, torch.bfloat16, torch.float32]
 
     @staticmethod
-    def get_supported_kernel_block_size() -> list[Union[int, MultipleOf]]:
+    def get_supported_kernel_block_size() -> list[int | MultipleOf]:
         return [MultipleOf(16)]
 
     @classmethod
@@ -216,13 +216,13 @@ class TritonAttentionImpl(AttentionImpl):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
+        logits_soft_cap: float | None = None,
         attn_type: AttentionType = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[int] = None,
-        sinks: Optional[torch.Tensor] = None,
+        kv_sharing_target_layer_name: int | None = None,
+        sinks: torch.Tensor | None = None,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -272,9 +272,9 @@ class TritonAttentionImpl(AttentionImpl):
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: TritonAttentionMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Forward pass with Paged Attention impl. in Triton.
 
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index 7c6940d9b15d5..beb267f196fb9 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -11,10 +11,8 @@ from typing import (
     ClassVar,
     Generic,
     Literal,
-    Optional,
     Protocol,
     TypeVar,
-    Union,
     get_args,
 )
 
@@ -42,7 +40,7 @@ from vllm.v1.worker.ubatch_utils import UBatchSlice
 
 logger = init_logger(__name__)
 KVCacheLayoutType = Literal["NHD", "HND"]
-_KV_CACHE_LAYOUT_OVERRIDE: Union[KVCacheLayoutType, None] = None
+_KV_CACHE_LAYOUT_OVERRIDE: KVCacheLayoutType | None = None
 
 PAD_SLOT_ID = -1
 
@@ -87,13 +85,13 @@ class CommonAttentionMetadata:
     causal: bool = True
 
     # Needed by FastPrefillAttentionBuilder
-    logits_indices_padded: Optional[torch.Tensor] = None
-    num_logits_indices: Optional[int] = None
+    logits_indices_padded: torch.Tensor | None = None
+    num_logits_indices: int | None = None
 
     # Needed by CrossAttentionBuilder
-    encoder_seq_lens: Optional[np.ndarray] = None
+    encoder_seq_lens: np.ndarray | None = None
 
-    dcp_local_seq_lens: Optional[torch.Tensor] = None
+    dcp_local_seq_lens: torch.Tensor | None = None
     """Sequence lengths of the local rank in decode context parallelism world"""
 
 
@@ -250,7 +248,7 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]):
     # Does this backend/builder reorder the batch?
     # If not, set this to None. Otherwise set it to the query
     # length that will be pulled into the front of the batch.
-    reorder_batch_threshold: Optional[int] = None
+    reorder_batch_threshold: int | None = None
 
     @abstractmethod
     def __init__(
@@ -395,12 +393,12 @@ class PerLayerParameters:
     """
 
     window_left: int
-    logits_soft_cap: Optional[float]
+    logits_soft_cap: float | None
     sm_scale: float
     has_sinks: bool = False
     # has same params for all layers
-    has_same_window_lefts: Optional[bool] = field(default=None, compare=False)
-    has_same_all_params: Optional[bool] = field(default=None, compare=False)
+    has_same_window_lefts: bool | None = field(default=None, compare=False)
+    has_same_all_params: bool | None = field(default=None, compare=False)
 
 
 def get_per_layer_parameters(
@@ -873,7 +871,7 @@ def reshape_attn_output_for_spec_decode(attn_output: torch.Tensor) -> torch.Tens
 
 
 KV_SHARING_FAST_PREFILL_METADATA_FIELDS = [
-    ("logits_indices_padded", Optional[torch.Tensor], None),
+    ("logits_indices_padded", torch.Tensor | None, None),
     ("num_logits_indices", int, 0),
 ]
 
diff --git a/vllm/v1/attention/backends/xformers.py b/vllm/v1/attention/backends/xformers.py
index eb1fcc2c024d2..41c543c18adcc 100644
--- a/vllm/v1/attention/backends/xformers.py
+++ b/vllm/v1/attention/backends/xformers.py
@@ -3,7 +3,7 @@
 """Attention layer with XFormersAttention."""
 
 from dataclasses import dataclass
-from typing import Optional, Union
+from typing import Optional
 
 import torch
 
@@ -82,7 +82,7 @@ class XFormersAttentionBackend(AttentionBackend):
         ]
 
     @staticmethod
-    def get_supported_kernel_block_size() -> list[Union[int, MultipleOf]]:
+    def get_supported_kernel_block_size() -> list[int | MultipleOf]:
         return [MultipleOf(16)]
 
     @classmethod
@@ -280,12 +280,12 @@ class XFormersAttentionImpl(AttentionImpl):
         head_size: int,
         scale: float,
         num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
+        alibi_slopes: list[float] | None,
+        sliding_window: int | None,
         kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
+        logits_soft_cap: float | None = None,
         attn_type: AttentionType = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
+        kv_sharing_target_layer_name: str | None = None,
     ) -> None:
         if kv_sharing_target_layer_name is not None:
             raise NotImplementedError("KV sharing is not supported in V0.")
@@ -328,9 +328,9 @@ class XFormersAttentionImpl(AttentionImpl):
         value: torch.Tensor,
         kv_cache: torch.Tensor,
         attn_metadata: XFormersAttentionMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
+        output: torch.Tensor | None = None,
+        output_scale: torch.Tensor | None = None,
+        output_block_scale: torch.Tensor | None = None,
     ) -> torch.Tensor:
         """Forward pass with XFormers.
 
diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py
index ddfd94322737f..cd22db410a6e2 100644
--- a/vllm/v1/core/block_pool.py
+++ b/vllm/v1/core/block_pool.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterable
-from typing import Any, Optional, Union
+from typing import Any
 
 from vllm.distributed.kv_events import (
     MEDIUM_GPU,
@@ -51,10 +51,10 @@ class BlockHashToBlockMap:
 
     def __init__(self):
         self._cache: dict[
-            BlockHashWithGroupId, Union[KVCacheBlock, dict[int, KVCacheBlock]]
+            BlockHashWithGroupId, KVCacheBlock | dict[int, KVCacheBlock]
         ] = {}
 
-    def get_one_block(self, key: BlockHashWithGroupId) -> Optional[KVCacheBlock]:
+    def get_one_block(self, key: BlockHashWithGroupId) -> KVCacheBlock | None:
         """
         Gets any block with the given block hash key.
         """
@@ -85,7 +85,7 @@ class BlockHashToBlockMap:
         else:
             self._unexpected_blocks_type(blocks)
 
-    def pop(self, key: BlockHashWithGroupId, block_id: int) -> Optional[KVCacheBlock]:
+    def pop(self, key: BlockHashWithGroupId, block_id: int) -> KVCacheBlock | None:
         """
         Checks if block_hash exists and pop block_id from the cache
         """
@@ -168,7 +168,7 @@ class BlockPool:
 
     def get_cached_block(
         self, block_hash: BlockHash, kv_cache_group_ids: list[int]
-    ) -> Optional[list[KVCacheBlock]]:
+    ) -> list[KVCacheBlock] | None:
         """Get the cached block by the block hash for each group in
         `kv_cache_group_ids`, or None if cache miss for any group.
         If there are duplicated blocks, we return the first block in the cache.
@@ -225,7 +225,7 @@ class BlockPool:
         assert len(request.block_hashes) >= num_full_blocks
         new_block_hashes = request.block_hashes[num_cached_blocks:]
 
-        new_hashes: Optional[list[ExternalBlockHash]] = (
+        new_hashes: list[ExternalBlockHash] | None = (
             [] if self.enable_kv_cache_events else None
         )
         for i, blk in enumerate(new_full_blocks):
@@ -243,7 +243,7 @@ class BlockPool:
 
         if self.enable_kv_cache_events:
             if num_cached_blocks == 0:
-                parent_block_hash: Optional[ExternalBlockHash] = None
+                parent_block_hash: ExternalBlockHash | None = None
             else:
                 parent_block = blocks[num_cached_blocks - 1]
                 assert parent_block.block_hash is not None
diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
index ef6da9adeea70..ece382277255f 100644
--- a/vllm/v1/core/kv_cache_coordinator.py
+++ b/vllm/v1/core/kv_cache_coordinator.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
-from typing import Optional
 
 from vllm.v1.core.block_pool import BlockPool
 from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock
@@ -320,8 +319,8 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
         one of them is full attention. Then, split the kv cache groups into full
         attention groups and other groups.
         """
-        full_attention_spec: Optional[FullAttentionSpec] = None
-        other_spec: Optional[KVCacheSpec] = None
+        full_attention_spec: FullAttentionSpec | None = None
+        other_spec: KVCacheSpec | None = None
         self.full_attention_group_ids: list[int] = []
         self.other_group_ids: list[int] = []
         for i, g in enumerate(self.kv_cache_config.kv_cache_groups):
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index b74ccd30b97b3..7a1025fc2bb4f 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import Literal, Optional, overload
+from typing import Literal, overload
 
 from vllm.distributed.kv_events import KVCacheEvent
 from vllm.logger import init_logger
@@ -49,12 +49,12 @@ class KVCacheBlocks:
     def get_block_ids(
         self,
         allow_none: Literal[True] = True,
-    ) -> Optional[tuple[list[int], ...]]: ...
+    ) -> tuple[list[int], ...] | None: ...
 
     def get_block_ids(
         self,
         allow_none: bool = False,
-    ) -> Optional[tuple[list[int], ...]]:
+    ) -> tuple[list[int], ...] | None:
         """
         Converts the KVCacheBlocks instance to block_ids.
 
@@ -97,7 +97,7 @@ class KVCacheManager:
         # FIXME: make prefix cache stats conditional on log_stats
         self.prefix_cache_stats = PrefixCacheStats() if log_stats else None
 
-        self.block_size: Optional[int] = None
+        self.block_size: int | None = None
         if self.enable_caching:
             assert (
                 len(
@@ -140,7 +140,7 @@ class KVCacheManager:
         """
         return self.block_pool.get_usage()
 
-    def make_prefix_cache_stats(self) -> Optional[PrefixCacheStats]:
+    def make_prefix_cache_stats(self) -> PrefixCacheStats | None:
         """Get (and reset) the prefix cache stats.
 
         Returns:
@@ -205,11 +205,11 @@ class KVCacheManager:
         request: Request,
         num_new_tokens: int,
         num_new_computed_tokens: int = 0,
-        new_computed_blocks: Optional[KVCacheBlocks] = None,
+        new_computed_blocks: KVCacheBlocks | None = None,
         num_lookahead_tokens: int = 0,
         delay_cache_blocks: bool = False,
         num_encoder_tokens: int = 0,
-    ) -> Optional[KVCacheBlocks]:
+    ) -> KVCacheBlocks | None:
         """Add slots for a request with new tokens to append.
 
         Args:
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 7a602b9936855..6c9a77ccb2b6a 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -5,9 +5,9 @@
 import copy
 import os
 from collections import defaultdict
-from collections.abc import Iterable, Sequence
+from collections.abc import Callable, Iterable, Sequence
 from dataclasses import dataclass
-from typing import Any, Callable, NewType, Optional, Union
+from typing import Any, NewType, TypeAlias
 
 from vllm import envs
 from vllm.config import VllmConfig
@@ -38,7 +38,7 @@ BlockHashWithGroupId = NewType("BlockHashWithGroupId", bytes)
 # ExternalBlockHash is used for reproducible prefix-cache block hashing.
 # It's a union of ``bytes`` and ``int`` to keep backward compatibility
 # after we default block hashing to use sha256 bytes.
-ExternalBlockHash = Union[bytes, int]
+ExternalBlockHash: TypeAlias = bytes | int
 
 
 def make_block_hash_with_group_id(
@@ -110,18 +110,18 @@ class KVCacheBlock:
     ref_cnt: int = 0
     # The hash key (block hash + group id) of the block, only available
     # when the block is full and cached.
-    _block_hash: Optional[BlockHashWithGroupId] = None
+    _block_hash: BlockHashWithGroupId | None = None
 
     # Used to construct a doubly linked list for free blocks.
     # These two attributes should only be manipulated by FreeKVCacheBlockQueue.
-    prev_free_block: Optional["KVCacheBlock"] = None
-    next_free_block: Optional["KVCacheBlock"] = None
+    prev_free_block: "KVCacheBlock | None" = None
+    next_free_block: "KVCacheBlock | None" = None
 
     # Whether the block is a null block that should never be cached.
     is_null: bool = False
 
     @property
-    def block_hash(self) -> Optional[BlockHashWithGroupId]:
+    def block_hash(self) -> BlockHashWithGroupId | None:
         return self._block_hash
 
     @block_hash.setter
@@ -461,7 +461,7 @@ def _gen_lora_extra_hash_keys(request: Request) -> list[int]:
 
 def generate_block_hash_extra_keys(
     request: Request, start_token_idx: int, end_token_idx: int, start_mm_idx: int
-) -> tuple[Optional[tuple[Any, ...]], int]:
+) -> tuple[tuple[Any, ...] | None, int]:
     """Generate extra keys for the block hash. The extra keys can come from
     the multi-modal inputs and request specific metadata (e.g., LoRA ID).
 
@@ -493,9 +493,9 @@ def generate_block_hash_extra_keys(
 
 def hash_block_tokens(
     hash_function: Callable[[Any], bytes],
-    parent_block_hash: Optional[BlockHash],
+    parent_block_hash: BlockHash | None,
     curr_block_token_ids: Sequence[int],
-    extra_keys: Optional[tuple[Any, ...]] = None,
+    extra_keys: tuple[Any, ...] | None = None,
 ) -> BlockHash:
     """Computes a hash value corresponding to the contents of a block and
     the contents of the preceding block(s). The hash value is used for
diff --git a/vllm/v1/core/sched/async_scheduler.py b/vllm/v1/core/sched/async_scheduler.py
index 968b4db530bfe..da6e4aa2996bb 100644
--- a/vllm/v1/core/sched/async_scheduler.py
+++ b/vllm/v1/core/sched/async_scheduler.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 from vllm.logger import init_logger
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.core.sched.scheduler import Scheduler
diff --git a/vllm/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py
index b92ef395e9b71..c36483203343d 100644
--- a/vllm/v1/core/sched/interface.py
+++ b/vllm/v1/core/sched/interface.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from collections.abc import Iterable
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING, Optional
 
 if TYPE_CHECKING:
     from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
@@ -80,7 +80,7 @@ class SchedulerInterface(ABC):
     @abstractmethod
     def finish_requests(
         self,
-        request_ids: Union[str, Iterable[str]],
+        request_ids: str | Iterable[str],
         finished_status: "RequestStatus",
     ) -> None:
         """Finish the requests in the scheduler's internal queue. If the request
diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py
index 981c5e9c76361..bce15e1a476fd 100644
--- a/vllm/v1/core/sched/output.py
+++ b/vllm/v1/core/sched/output.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 
@@ -19,6 +17,13 @@ if TYPE_CHECKING:
     from vllm.pooling_params import PoolingParams
     from vllm.sampling_params import SamplingParams
     from vllm.v1.request import Request
+else:
+    KVConnectorMetadata = object
+    LoRARequest = object
+    MultiModalFeatureSpec = object
+    PoolingParams = object
+    SamplingParams = object
+    Request = object
 
 
 @bc_linter_include
@@ -32,14 +37,14 @@ class NewRequestData:
     block_ids: tuple[list[int], ...]
     num_computed_tokens: int
     lora_request: LoRARequest | None
-    prompt_embeds: torch.Tensor | None = None
+    prompt_embeds: "torch.Tensor | None" = None
 
     @classmethod
     def from_request(
         cls,
         request: Request,
         block_ids: tuple[list[int], ...],
-    ) -> NewRequestData:
+    ) -> "NewRequestData":
         return cls(
             req_id=request.request_id,
             prompt_token_ids=request.prompt_token_ids,
@@ -110,7 +115,7 @@ class CachedRequestData:
         return len(self.req_ids)
 
     @classmethod
-    def make_empty(cls) -> CachedRequestData:
+    def make_empty(cls) -> "CachedRequestData":
         return cls(
             req_ids=[],
             resumed_from_preemption=[],
@@ -164,7 +169,7 @@ class SchedulerOutput:
     # for filling the next token bitmask
     structured_output_request_ids: dict[str, int]
     # the bitmask for the whole batch
-    grammar_bitmask: npt.NDArray[np.int32] | None
+    grammar_bitmask: "npt.NDArray[np.int32] | None"
 
     # KV Cache Connector metadata.
     kv_connector_metadata: KVConnectorMetadata | None = None
diff --git a/vllm/v1/core/sched/request_queue.py b/vllm/v1/core/sched/request_queue.py
index 33e5ec72ebd78..7bc1010db23a2 100644
--- a/vllm/v1/core/sched/request_queue.py
+++ b/vllm/v1/core/sched/request_queue.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import heapq
 from abc import ABC, abstractmethod
 from collections import deque
@@ -43,7 +41,7 @@ class RequestQueue(ABC):
         pass
 
     @abstractmethod
-    def prepend_requests(self, requests: RequestQueue) -> None:
+    def prepend_requests(self, requests: "RequestQueue") -> None:
         """Prepend all requests from another queue to the front of this
         queue."""
         pass
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 0f1504724d7c6..32c2eb8a46526 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -1,13 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import itertools
 import time
 from collections import defaultdict
 from collections.abc import Iterable
-from typing import Any, Union
+from typing import Any
 
 from vllm.config import VllmConfig
 from vllm.distributed.kv_events import EventPublisherFactory, KVEventBatch
@@ -1168,7 +1166,7 @@ class Scheduler(SchedulerInterface):
 
     def finish_requests(
         self,
-        request_ids: Union[str, Iterable[str]],
+        request_ids: str | Iterable[str],
         finished_status: RequestStatus,
     ) -> None:
         """Handles the finish signal from outside the scheduler.
diff --git a/vllm/v1/core/sched/utils.py b/vllm/v1/core/sched/utils.py
index 5906a73382a2d..8af8a7d278064 100644
--- a/vllm/v1/core/sched/utils.py
+++ b/vllm/v1/core/sched/utils.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import contextlib
-from typing import Optional
 
 import torch
 
@@ -41,7 +40,7 @@ def remove_all(lst: list, items_to_remove: set) -> list:
 
 
 def check_stop(
-    request: Request, max_model_len: int, pooler_output: Optional[torch.Tensor] = None
+    request: Request, max_model_len: int, pooler_output: torch.Tensor | None = None
 ) -> bool:
     if (
         request.num_tokens >= max_model_len
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index ce47147028696..9f071a0ddac22 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 from vllm.config import CUDAGraphMode, VllmConfig
 from vllm.forward_context import BatchDescriptor
@@ -102,7 +101,7 @@ class CudagraphDispatcher:
 
     def dispatch(
         self, batch_descriptor: BatchDescriptor, use_cascade_attn: bool = False
-    ) -> tuple[CUDAGraphMode, Optional[BatchDescriptor]]:
+    ) -> tuple[CUDAGraphMode, BatchDescriptor | None]:
         """
         Given conditions(e.g.,batch descriptor and if using cascade attention),
         dispatch to a cudagraph runtime mode and the valid batch descriptor.
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 163c050e559e0..e2c1ed7b561c7 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -4,7 +4,7 @@
 import enum
 import time
 from collections.abc import Mapping
-from typing import Any, Optional, Union
+from typing import Any
 
 import msgspec
 import torch
@@ -48,16 +48,16 @@ class EngineCoreRequest(
     gc=False,
 ):  # type: ignore[call-arg]
     request_id: str
-    prompt_token_ids: Optional[list[int]]
-    mm_features: Optional[list[MultiModalFeatureSpec]]
-    sampling_params: Optional[SamplingParams]
-    pooling_params: Optional[PoolingParams]
-    eos_token_id: Optional[int]
+    prompt_token_ids: list[int] | None
+    mm_features: list[MultiModalFeatureSpec] | None
+    sampling_params: SamplingParams | None
+    pooling_params: PoolingParams | None
+    eos_token_id: int | None
     arrival_time: float
-    lora_request: Optional[LoRARequest]
-    cache_salt: Optional[str]
-    data_parallel_rank: Optional[int]
-    prompt_embeds: Optional[torch.Tensor] = None
+    lora_request: LoRARequest | None
+    cache_salt: str | None
+    data_parallel_rank: int | None
+    prompt_embeds: torch.Tensor | None = None
 
     # Index of the client, used to ensure outputs are sent back to the same
     # client for this request when scaling out the front-end.
@@ -69,7 +69,7 @@ class EngineCoreRequest(
     current_wave: int = 0
     priority: int = 0
 
-    trace_headers: Optional[Mapping[str, str]] = None
+    trace_headers: Mapping[str, str] | None = None
 
 
 class EngineCoreEventType(enum.IntEnum):
@@ -93,7 +93,7 @@ class EngineCoreEvent(msgspec.Struct):
 
     @classmethod
     def new_event(
-        cls, event_type: EngineCoreEventType, timestamp: Optional[float] = None
+        cls, event_type: EngineCoreEventType, timestamp: float | None = None
     ) -> "EngineCoreEvent":
         timestamp = time.monotonic() if timestamp is None else timestamp
         return cls(event_type, timestamp)
@@ -108,17 +108,17 @@ class EngineCoreOutput(
     request_id: str
     new_token_ids: list[int]
 
-    new_logprobs: Optional[LogprobsLists] = None
-    new_prompt_logprobs_tensors: Optional[LogprobsTensors] = None
+    new_logprobs: LogprobsLists | None = None
+    new_prompt_logprobs_tensors: LogprobsTensors | None = None
 
-    pooling_output: Optional[torch.Tensor] = None
+    pooling_output: torch.Tensor | None = None
 
-    finish_reason: Optional[FinishReason] = None
-    stop_reason: Union[int, str, None] = None
-    events: Optional[list[EngineCoreEvent]] = None
-    kv_transfer_params: Optional[dict[str, Any]] = None
+    finish_reason: FinishReason | None = None
+    stop_reason: int | str | None = None
+    events: list[EngineCoreEvent] | None = None
+    kv_transfer_params: dict[str, Any] | None = None
 
-    trace_headers: Optional[Mapping[str, str]] = None
+    trace_headers: Mapping[str, str] | None = None
     # The number of tokens with prefix cache hits.
     num_cached_tokens: int = 0
 
@@ -142,8 +142,8 @@ class UtilityOutput(
     call_id: int
 
     # Non-None implies the call failed, result should be None.
-    failure_message: Optional[str] = None
-    result: Optional[UtilityResult] = None
+    failure_message: str | None = None
+    result: UtilityResult | None = None
 
 
 class EngineCoreOutputs(
@@ -159,18 +159,18 @@ class EngineCoreOutputs(
 
     # [num_reqs]
     outputs: list[EngineCoreOutput] = []
-    scheduler_stats: Optional[SchedulerStats] = None
+    scheduler_stats: SchedulerStats | None = None
     timestamp: float = 0.0
 
-    utility_output: Optional[UtilityOutput] = None
-    finished_requests: Optional[set[str]] = None
+    utility_output: UtilityOutput | None = None
+    finished_requests: set[str] | None = None
 
     # In DP case, used to signal that the current wave of requests
     # has finished and the engines are paused.
-    wave_complete: Optional[int] = None
+    wave_complete: int | None = None
     # In DP case, used to signal that a request was received for an
     # "old" wave, so the next wave needs to be started in other engines.
-    start_wave: Optional[int] = None
+    start_wave: int | None = None
 
     def __post_init__(self):
         if self.timestamp == 0.0:
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 112ec92b3af8e..fbbe15b7b04f2 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -6,7 +6,7 @@ import socket
 import time
 from collections.abc import AsyncGenerator, Iterable, Mapping
 from copy import copy
-from typing import Any, Optional, Union
+from typing import Any
 
 import numpy as np
 import torch
@@ -56,8 +56,8 @@ class AsyncLLM(EngineClient):
         use_cached_outputs: bool = False,
         log_requests: bool = True,
         start_engine_loop: bool = True,
-        stat_loggers: Optional[list[StatLoggerFactory]] = None,
-        client_addresses: Optional[dict[str, str]] = None,
+        stat_loggers: list[StatLoggerFactory] | None = None,
+        client_addresses: dict[str, str] | None = None,
         client_count: int = 1,
         client_index: int = 0,
     ) -> None:
@@ -136,7 +136,7 @@ class AsyncLLM(EngineClient):
         )
 
         # Loggers.
-        self.logger_manager: Optional[StatLoggerManager] = None
+        self.logger_manager: StatLoggerManager | None = None
         if self.log_stats:
             self.logger_manager = StatLoggerManager(
                 vllm_config=vllm_config,
@@ -147,7 +147,7 @@ class AsyncLLM(EngineClient):
             )
             self.logger_manager.log_engine_initialized()
 
-        self.output_handler: Optional[asyncio.Task] = None
+        self.output_handler: asyncio.Task | None = None
         try:
             # Start output handler eagerly if we are in the asyncio eventloop.
             asyncio.get_running_loop()
@@ -185,10 +185,10 @@ class AsyncLLM(EngineClient):
         vllm_config: VllmConfig,
         start_engine_loop: bool = True,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        stat_loggers: Optional[list[StatLoggerFactory]] = None,
+        stat_loggers: list[StatLoggerFactory] | None = None,
         enable_log_requests: bool = False,
         disable_log_stats: bool = False,
-        client_addresses: Optional[dict[str, str]] = None,
+        client_addresses: dict[str, str] | None = None,
         client_count: int = 1,
         client_index: int = 0,
         disable_log_requests: bool = True,  # Deprecated, will be removed
@@ -221,7 +221,7 @@ class AsyncLLM(EngineClient):
         engine_args: AsyncEngineArgs,
         start_engine_loop: bool = True,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        stat_loggers: Optional[list[StatLoggerFactory]] = None,
+        stat_loggers: list[StatLoggerFactory] | None = None,
     ) -> "AsyncLLM":
         """Create an AsyncLLM from the EngineArgs."""
 
@@ -259,15 +259,15 @@ class AsyncLLM(EngineClient):
     async def add_request(
         self,
         request_id: str,
-        prompt: Union[EngineCoreRequest, PromptType],
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
+        prompt: EngineCoreRequest | PromptType,
+        params: SamplingParams | PoolingParams,
+        arrival_time: float | None = None,
+        lora_request: LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
+        trace_headers: Mapping[str, str] | None = None,
         priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-        prompt_text: Optional[str] = None,
+        data_parallel_rank: int | None = None,
+        prompt_text: str | None = None,
     ) -> RequestOutputCollector:
         """Add new request to the AsyncLLM."""
 
@@ -325,8 +325,8 @@ class AsyncLLM(EngineClient):
     async def _add_request(
         self,
         request: EngineCoreRequest,
-        prompt: Optional[str],
-        parent_req: Optional[ParentRequest],
+        prompt: str | None,
+        parent_req: ParentRequest | None,
         index: int,
         queue: RequestOutputCollector,
     ):
@@ -346,16 +346,16 @@ class AsyncLLM(EngineClient):
     # re-multiplexed in the API server anyhow.
     async def generate(
         self,
-        prompt: Union[EngineCoreRequest, PromptType],
+        prompt: EngineCoreRequest | PromptType,
         sampling_params: SamplingParams,
         request_id: str,
         *,
-        prompt_text: Optional[str] = None,
-        lora_request: Optional[LoRARequest] = None,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
+        prompt_text: str | None = None,
+        lora_request: LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
+        trace_headers: Mapping[str, str] | None = None,
         priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
+        data_parallel_rank: int | None = None,
     ) -> AsyncGenerator[RequestOutput, None]:
         """
         Main function called by the API server to kick off a request
@@ -520,7 +520,7 @@ class AsyncLLM(EngineClient):
 
         self.output_handler = asyncio.create_task(output_handler())
 
-    async def abort(self, request_id: Union[str, Iterable[str]]) -> None:
+    async def abort(self, request_id: str | Iterable[str]) -> None:
         """Abort RequestId in OutputProcessor and EngineCore."""
 
         request_ids = (
@@ -537,11 +537,11 @@ class AsyncLLM(EngineClient):
         prompt: PromptType,
         pooling_params: PoolingParams,
         request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
+        lora_request: LoRARequest | None = None,
+        trace_headers: Mapping[str, str] | None = None,
         priority: int = 0,
-        truncate_prompt_tokens: Optional[int] = None,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
+        truncate_prompt_tokens: int | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
     ) -> AsyncGenerator[PoolingRequestOutput, None]:
         """
         Main function called by the API server to kick off a request
@@ -622,11 +622,11 @@ class AsyncLLM(EngineClient):
             raise EngineGenerateError() from e
 
     @property
-    def tokenizer(self) -> Optional[AnyTokenizer]:
+    def tokenizer(self) -> AnyTokenizer | None:
         return self.processor.tokenizer
 
     @tokenizer.setter
-    def tokenizer(self, tokenizer: Optional[AnyTokenizer]) -> None:
+    def tokenizer(self, tokenizer: AnyTokenizer | None) -> None:
         self.processor.tokenizer = tokenizer
 
     async def get_tokenizer(self) -> AnyTokenizer:
@@ -665,7 +665,7 @@ class AsyncLLM(EngineClient):
         self.processor.clear_mm_cache()
         await self.engine_core.reset_mm_cache_async()
 
-    async def reset_prefix_cache(self, device: Optional[Device] = None) -> None:
+    async def reset_prefix_cache(self, device: Device | None = None) -> None:
         if device == Device.CPU:
             raise ValueError("Not supported on CPU.")
         await self.engine_core.reset_prefix_cache_async()
@@ -674,7 +674,7 @@ class AsyncLLM(EngineClient):
         await self.reset_prefix_cache()
         await self.engine_core.sleep_async(level)
 
-    async def wake_up(self, tags: Optional[list[str]] = None) -> None:
+    async def wake_up(self, tags: list[str] | None = None) -> None:
         await self.engine_core.wake_up_async(tags)
 
     async def is_sleeping(self) -> bool:
@@ -699,9 +699,9 @@ class AsyncLLM(EngineClient):
     async def collective_rpc(
         self,
         method: str,
-        timeout: Optional[float] = None,
+        timeout: float | None = None,
         args: tuple = (),
-        kwargs: Optional[dict] = None,
+        kwargs: dict | None = None,
     ):
         """
         Perform a collective RPC call to the given path.
diff --git a/vllm/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py
index 9bb08e6db7bec..7a27e2fe2c3c0 100644
--- a/vllm/v1/engine/coordinator.py
+++ b/vllm/v1/engine/coordinator.py
@@ -4,7 +4,6 @@ import copy
 import multiprocessing
 import time
 import weakref
-from typing import Optional
 
 import msgspec.msgpack
 import zmq
@@ -155,7 +154,7 @@ class DPCoordinatorProc:
         stats_changed = False
         last_stats_step = -1
         last_stats_wave = -1
-        last_step_counts: Optional[list[list[int]]] = None
+        last_step_counts: list[list[int]] | None = None
 
         with (
             make_zmq_socket(
@@ -360,7 +359,7 @@ class DPCoordinatorProc:
 
     @staticmethod
     def _send_start_wave(
-        socket: zmq.Socket, wave: int, exclude_engine_index: Optional[int]
+        socket: zmq.Socket, wave: int, exclude_engine_index: int | None
     ):
         """Broadcast the START_DP_WAVE message to all the engines.
         It includes the current wave number and index of engine which
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index e6474d91ffedb..9aa4d459e2104 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -7,12 +7,12 @@ import signal
 import threading
 import time
 from collections import deque
-from collections.abc import Generator
+from collections.abc import Callable, Generator
 from concurrent.futures import Future
 from contextlib import ExitStack, contextmanager
 from inspect import isclass, signature
 from logging import DEBUG
-from typing import Any, Callable, Optional, TypeVar, Union
+from typing import Any, TypeVar
 
 import msgspec
 import zmq
@@ -83,7 +83,7 @@ class EngineCore:
         vllm_config: VllmConfig,
         executor_class: type[Executor],
         log_stats: bool,
-        executor_fail_callback: Optional[Callable] = None,
+        executor_fail_callback: Callable | None = None,
     ):
         # plugins need to be loaded at the engine/scheduler level too
         from vllm.plugins import load_general_plugins
@@ -171,14 +171,14 @@ class EngineCore:
         # schedule and execute batches, and is required by pipeline parallelism
         # to eliminate pipeline bubbles.
         self.batch_queue_size = self.model_executor.max_concurrent_batches
-        self.batch_queue: Optional[
-            deque[tuple[Future[ModelRunnerOutput], SchedulerOutput]]
-        ] = None
+        self.batch_queue: (
+            deque[tuple[Future[ModelRunnerOutput], SchedulerOutput]] | None
+        ) = None
         if self.batch_queue_size > 1:
             logger.info("Batch queue is enabled with size %d", self.batch_queue_size)
             self.batch_queue = deque(maxlen=self.batch_queue_size)
 
-        self.request_block_hasher: Optional[Callable[[Request], list[BlockHash]]] = None
+        self.request_block_hasher: Callable[[Request], list[BlockHash]] | None = None
         if (
             self.vllm_config.cache_config.enable_prefix_caching
             or self.scheduler.get_kv_connector() is not None
@@ -337,7 +337,7 @@ class EngineCore:
 
     def step_with_batch_queue(
         self,
-    ) -> tuple[Optional[dict[int, EngineCoreOutputs]], bool]:
+    ) -> tuple[dict[int, EngineCoreOutputs] | None, bool]:
         """Schedule and execute batches with the batch queue.
         Note that if nothing to output in this step, None is returned.
 
@@ -424,7 +424,7 @@ class EngineCore:
     def sleep(self, level: int = 1):
         self.model_executor.sleep(level)
 
-    def wake_up(self, tags: Optional[list[str]] = None):
+    def wake_up(self, tags: list[str] | None = None):
         self.model_executor.wake_up(tags)
 
     def is_sleeping(self) -> bool:
@@ -448,8 +448,8 @@ class EngineCore:
     def save_sharded_state(
         self,
         path: str,
-        pattern: Optional[str] = None,
-        max_size: Optional[int] = None,
+        pattern: str | None = None,
+        max_size: int | None = None,
     ) -> None:
         self.model_executor.save_sharded_state(
             path=path, pattern=pattern, max_size=max_size
@@ -457,10 +457,10 @@ class EngineCore:
 
     def collective_rpc(
         self,
-        method: Union[str, Callable[..., _R]],
-        timeout: Optional[float] = None,
+        method: str | Callable[..., _R],
+        timeout: float | None = None,
         args: tuple = (),
-        kwargs: Optional[dict[str, Any]] = None,
+        kwargs: dict[str, Any] | None = None,
     ) -> list[_R]:
         return self.model_executor.collective_rpc(method, timeout, args, kwargs)
 
@@ -509,11 +509,11 @@ class EngineCoreProc(EngineCore):
         handshake_address: str,
         executor_class: type[Executor],
         log_stats: bool,
-        client_handshake_address: Optional[str] = None,
+        client_handshake_address: str | None = None,
         engine_index: int = 0,
     ):
         self.input_queue = queue.Queue[tuple[EngineCoreRequestType, Any]]()
-        self.output_queue = queue.Queue[Union[tuple[int, EngineCoreOutputs], bytes]]()
+        self.output_queue = queue.Queue[tuple[int, EngineCoreOutputs] | bytes]()
         executor_fail_callback = lambda: self.input_queue.put_nowait(
             (EngineCoreRequestType.EXECUTOR_FAILED, b"")
         )
@@ -606,7 +606,7 @@ class EngineCoreProc(EngineCore):
         identity: bytes,
         local_client: bool,
         vllm_config: VllmConfig,
-        client_handshake_address: Optional[str],
+        client_handshake_address: str | None,
     ) -> Generator[EngineZmqAddresses, None, None]:
         """
         Perform startup handshakes.
@@ -667,7 +667,7 @@ class EngineCoreProc(EngineCore):
         local_client: bool,
         headless: bool,
         vllm_config: VllmConfig,
-        parallel_config_to_update: Optional[ParallelConfig] = None,
+        parallel_config_to_update: ParallelConfig | None = None,
     ) -> Generator[EngineZmqAddresses, None, None]:
         with make_zmq_socket(
             ctx,
@@ -710,7 +710,7 @@ class EngineCoreProc(EngineCore):
         handshake_socket: zmq.Socket,
         local_client: bool,
         headless: bool,
-        parallel_config: Optional[ParallelConfig] = None,
+        parallel_config: ParallelConfig | None = None,
     ) -> EngineZmqAddresses:
         # Send registration message.
         handshake_socket.send(
@@ -765,7 +765,7 @@ class EngineCoreProc(EngineCore):
         signal.signal(signal.SIGTERM, signal_handler)
         signal.signal(signal.SIGINT, signal_handler)
 
-        engine_core: Optional[EngineCoreProc] = None
+        engine_core: EngineCoreProc | None = None
         try:
             parallel_config: ParallelConfig = kwargs["vllm_config"].parallel_config
             if parallel_config.data_parallel_size > 1 or dp_rank > 0:
@@ -911,7 +911,7 @@ class EngineCoreProc(EngineCore):
     def process_input_sockets(
         self,
         input_addresses: list[str],
-        coord_input_address: Optional[str],
+        coord_input_address: str | None,
         identity: bytes,
         ready_event: threading.Event,
     ):
@@ -980,7 +980,7 @@ class EngineCoreProc(EngineCore):
     def process_output_sockets(
         self,
         output_paths: list[str],
-        coord_output_path: Optional[str],
+        coord_output_path: str | None,
         engine_index: int,
     ):
         """Output socket IO thread."""
@@ -1059,7 +1059,7 @@ class DPEngineCoreProc(EngineCoreProc):
         handshake_address: str,
         executor_class: type[Executor],
         log_stats: bool,
-        client_handshake_address: Optional[str] = None,
+        client_handshake_address: str | None = None,
     ):
         # Counts forward-passes of the model so that we can synchronize
         # finished with DP peers every N steps.
@@ -1332,7 +1332,7 @@ class DPEngineCoreActor(DPEngineCoreProc):
         identity: bytes,
         local_client: bool,
         vllm_config: VllmConfig,
-        client_handshake_address: Optional[str],
+        client_handshake_address: str | None,
     ):
         """
         For Ray, we don't need to actually perform handshake.
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 27283411eada9..c800d0d279af1 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -9,11 +9,11 @@ import uuid
 import weakref
 from abc import ABC, abstractmethod
 from collections import defaultdict, deque
-from collections.abc import Awaitable, Sequence
+from collections.abc import Awaitable, Callable, Sequence
 from concurrent.futures import Future
 from dataclasses import dataclass
 from threading import Thread
-from typing import Any, Callable, Optional, TypeVar, Union
+from typing import Any, TypeAlias, TypeVar
 
 import msgspec.msgpack
 import zmq
@@ -51,7 +51,7 @@ from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder, bytestr
 
 logger = init_logger(__name__)
 
-AnyFuture = Union[asyncio.Future[Any], Future[Any]]
+AnyFuture: TypeAlias = asyncio.Future[Any] | Future[Any]
 
 _R = TypeVar("_R")  # Return type for collective_rpc
 
@@ -99,7 +99,7 @@ class EngineCoreClient(ABC):
         vllm_config: VllmConfig,
         executor_class: type[Executor],
         log_stats: bool,
-        client_addresses: Optional[dict[str, str]] = None,
+        client_addresses: dict[str, str] | None = None,
         client_count: int = 1,
         client_index: int = 0,
     ) -> "MPClient":
@@ -144,7 +144,7 @@ class EngineCoreClient(ABC):
     def sleep(self, level: int = 1) -> None:
         raise NotImplementedError
 
-    def wake_up(self, tags: Optional[list[str]] = None) -> None:
+    def wake_up(self, tags: list[str] | None = None) -> None:
         raise NotImplementedError
 
     def is_sleeping(self) -> bool:
@@ -172,16 +172,16 @@ class EngineCoreClient(ABC):
         raise NotImplementedError
 
     def save_sharded_state(
-        self, path: str, pattern: Optional[str] = None, max_size: Optional[int] = None
+        self, path: str, pattern: str | None = None, max_size: int | None = None
     ) -> None:
         raise NotImplementedError
 
     def collective_rpc(
         self,
-        method: Union[str, Callable[..., _R]],
-        timeout: Optional[float] = None,
+        method: str | Callable[..., _R],
+        timeout: float | None = None,
         args: tuple = (),
-        kwargs: Optional[dict[str, Any]] = None,
+        kwargs: dict[str, Any] | None = None,
     ) -> list[_R]:
         raise NotImplementedError
 
@@ -214,7 +214,7 @@ class EngineCoreClient(ABC):
     async def sleep_async(self, level: int = 1) -> None:
         raise NotImplementedError
 
-    async def wake_up_async(self, tags: Optional[list[str]] = None) -> None:
+    async def wake_up_async(self, tags: list[str] | None = None) -> None:
         raise NotImplementedError
 
     async def is_sleeping_async(self) -> bool:
@@ -236,16 +236,16 @@ class EngineCoreClient(ABC):
         raise NotImplementedError
 
     async def save_sharded_state_async(
-        self, path: str, pattern: Optional[str] = None, max_size: Optional[int] = None
+        self, path: str, pattern: str | None = None, max_size: int | None = None
     ) -> None:
         raise NotImplementedError
 
     async def collective_rpc_async(
         self,
-        method: Union[str, Callable[..., _R]],
-        timeout: Optional[float] = None,
+        method: str | Callable[..., _R],
+        timeout: float | None = None,
         args: tuple = (),
-        kwargs: Optional[dict[str, Any]] = None,
+        kwargs: dict[str, Any] | None = None,
     ) -> list[_R]:
         raise NotImplementedError
 
@@ -293,7 +293,7 @@ class InprocClient(EngineCoreClient):
     def sleep(self, level: int = 1) -> None:
         self.engine_core.sleep(level)
 
-    def wake_up(self, tags: Optional[list[str]] = None) -> None:
+    def wake_up(self, tags: list[str] | None = None) -> None:
         self.engine_core.wake_up(tags)
 
     def is_sleeping(self) -> bool:
@@ -315,16 +315,16 @@ class InprocClient(EngineCoreClient):
         return self.engine_core.pin_lora(lora_id)
 
     def save_sharded_state(
-        self, path: str, pattern: Optional[str] = None, max_size: Optional[int] = None
+        self, path: str, pattern: str | None = None, max_size: int | None = None
     ) -> None:
         self.engine_core.save_sharded_state(path, pattern, max_size)
 
     def collective_rpc(
         self,
-        method: Union[str, Callable[..., _R]],
-        timeout: Optional[float] = None,
+        method: str | Callable[..., _R],
+        timeout: float | None = None,
         args: tuple = (),
-        kwargs: Optional[dict[str, Any]] = None,
+        kwargs: dict[str, Any] | None = None,
     ) -> list[_R]:
         return self.engine_core.collective_rpc(method, timeout, args, kwargs)
 
@@ -340,18 +340,16 @@ class BackgroundResources:
     ctx: zmq.Context
     # If CoreEngineProcManager, it manages local engines;
     # if CoreEngineActorManager, it manages all engines.
-    engine_manager: Optional[Union[CoreEngineProcManager, CoreEngineActorManager]] = (
-        None
-    )
-    coordinator: Optional[DPCoordinator] = None
-    output_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None
-    input_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None
-    first_req_send_socket: Optional[zmq.asyncio.Socket] = None
-    first_req_rcv_socket: Optional[zmq.asyncio.Socket] = None
-    stats_update_socket: Optional[zmq.asyncio.Socket] = None
-    output_queue_task: Optional[asyncio.Task] = None
-    stats_update_task: Optional[asyncio.Task] = None
-    shutdown_path: Optional[str] = None
+    engine_manager: CoreEngineProcManager | CoreEngineActorManager | None = None
+    coordinator: DPCoordinator | None = None
+    output_socket: zmq.Socket | zmq.asyncio.Socket | None = None
+    input_socket: zmq.Socket | zmq.asyncio.Socket | None = None
+    first_req_send_socket: zmq.asyncio.Socket | None = None
+    first_req_rcv_socket: zmq.asyncio.Socket | None = None
+    stats_update_socket: zmq.asyncio.Socket | None = None
+    output_queue_task: asyncio.Task | None = None
+    stats_update_task: asyncio.Task | None = None
+    shutdown_path: str | None = None
 
     # Set if any of the engines are dead. Here so that the output
     # processing threads can access it without holding a ref to the client.
@@ -438,7 +436,7 @@ class MPClient(EngineCoreClient):
         vllm_config: VllmConfig,
         executor_class: type[Executor],
         log_stats: bool,
-        client_addresses: Optional[dict[str, str]] = None,
+        client_addresses: dict[str, str] | None = None,
     ):
         self.vllm_config = vllm_config
         # Serialization setup.
@@ -459,7 +457,7 @@ class MPClient(EngineCoreClient):
             # State used for data parallel.
             self.engines_running = False
 
-            self.stats_update_address: Optional[str] = None
+            self.stats_update_address: str | None = None
             if client_addresses:
                 # Engines are managed externally to this client.
                 input_address = client_addresses["input_address"]
@@ -646,7 +644,7 @@ class SyncMPClient(MPClient):
         )
 
         self.is_dp = self.vllm_config.parallel_config.data_parallel_size > 1
-        self.outputs_queue = queue.Queue[Union[EngineCoreOutputs, Exception]]()
+        self.outputs_queue = queue.Queue[EngineCoreOutputs | Exception]()
 
         # Ensure that the outputs socket processing thread does not have
         # a ref to the client which prevents gc.
@@ -770,7 +768,7 @@ class SyncMPClient(MPClient):
     def sleep(self, level: int = 1) -> None:
         self.call_utility("sleep", level)
 
-    def wake_up(self, tags: Optional[list[str]] = None) -> None:
+    def wake_up(self, tags: list[str] | None = None) -> None:
         self.call_utility("wake_up", tags)
 
     def is_sleeping(self) -> bool:
@@ -781,15 +779,15 @@ class SyncMPClient(MPClient):
 
     def collective_rpc(
         self,
-        method: Union[str, Callable[..., _R]],
-        timeout: Optional[float] = None,
+        method: str | Callable[..., _R],
+        timeout: float | None = None,
         args: tuple = (),
-        kwargs: Optional[dict[str, Any]] = None,
+        kwargs: dict[str, Any] | None = None,
     ) -> list[_R]:
         return self.call_utility("collective_rpc", method, timeout, args, kwargs)
 
     def save_sharded_state(
-        self, path: str, pattern: Optional[str] = None, max_size: Optional[int] = None
+        self, path: str, pattern: str | None = None, max_size: int | None = None
     ) -> None:
         self.call_utility("save_sharded_state", path, pattern, max_size)
 
@@ -802,7 +800,7 @@ class AsyncMPClient(MPClient):
         vllm_config: VllmConfig,
         executor_class: type[Executor],
         log_stats: bool,
-        client_addresses: Optional[dict[str, str]] = None,
+        client_addresses: dict[str, str] | None = None,
         client_count: int = 1,
         client_index: int = 0,
     ):
@@ -816,7 +814,7 @@ class AsyncMPClient(MPClient):
 
         self.client_count = client_count
         self.client_index = client_index
-        self.outputs_queue = asyncio.Queue[Union[EngineCoreOutputs, Exception]]()
+        self.outputs_queue = asyncio.Queue[EngineCoreOutputs | Exception]()
         try:
             # If we are running in an asyncio event loop, start the queue task.
             # Otherwise, it will be started lazily. If it is not started here,
@@ -837,9 +835,9 @@ class AsyncMPClient(MPClient):
         decoder = self.decoder
         utility_results = self.utility_results
         outputs_queue = self.outputs_queue
-        output_handler: Optional[
-            Callable[[AsyncMPClient, EngineCoreOutputs], Awaitable[None]]
-        ] = getattr(self.__class__, "process_engine_outputs", None)
+        output_handler: (
+            Callable[[AsyncMPClient, EngineCoreOutputs], Awaitable[None]] | None
+        ) = getattr(self.__class__, "process_engine_outputs", None)
         _self_ref = weakref.ref(self) if output_handler else None
         output_socket = resources.output_socket
         assert output_socket is not None
@@ -888,7 +886,7 @@ class AsyncMPClient(MPClient):
         self,
         request_type: EngineCoreRequestType,
         request: Any,
-        engine: Optional[EngineIdentity] = None,
+        engine: EngineIdentity | None = None,
     ) -> Awaitable[Any]:
         if engine is None:
             engine = self.core_engine
@@ -962,7 +960,7 @@ class AsyncMPClient(MPClient):
     async def sleep_async(self, level: int = 1) -> None:
         await self.call_utility_async("sleep", level)
 
-    async def wake_up_async(self, tags: Optional[list[str]] = None) -> None:
+    async def wake_up_async(self, tags: list[str] | None = None) -> None:
         await self.call_utility_async("wake_up", tags)
 
     async def is_sleeping_async(self) -> bool:
@@ -984,16 +982,16 @@ class AsyncMPClient(MPClient):
         return await self.call_utility_async("pin_lora", lora_id)
 
     async def save_sharded_state_async(
-        self, path: str, pattern: Optional[str] = None, max_size: Optional[int] = None
+        self, path: str, pattern: str | None = None, max_size: int | None = None
     ) -> None:
         await self.call_utility_async("save_sharded_state", path, pattern, max_size)
 
     async def collective_rpc_async(
         self,
-        method: Union[str, Callable[..., _R]],
-        timeout: Optional[float] = None,
+        method: str | Callable[..., _R],
+        timeout: float | None = None,
         args: tuple = (),
-        kwargs: Optional[dict[str, Any]] = None,
+        kwargs: dict[str, Any] | None = None,
     ) -> list[_R]:
         return await self.call_utility_async(
             "collective_rpc", method, timeout, args, kwargs
@@ -1009,7 +1007,7 @@ class DPAsyncMPClient(AsyncMPClient):
         vllm_config: VllmConfig,
         executor_class: type[Executor],
         log_stats: bool,
-        client_addresses: Optional[dict[str, str]] = None,
+        client_addresses: dict[str, str] | None = None,
         client_count: int = 1,
         client_index: int = 0,
     ):
@@ -1166,7 +1164,7 @@ class DPLBAsyncMPClient(DPAsyncMPClient):
         vllm_config: VllmConfig,
         executor_class: type[Executor],
         log_stats: bool,
-        client_addresses: Optional[dict[str, str]] = None,
+        client_addresses: dict[str, str] | None = None,
         client_count: int = 1,
         client_index: int = 0,
     ):
diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py
index 5efde9e2ff878..5f66e36893bf3 100644
--- a/vllm/v1/engine/detokenizer.py
+++ b/vllm/v1/engine/detokenizer.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
-from typing import Optional
 
 import tokenizers
 from packaging import version
@@ -36,7 +35,7 @@ class IncrementalDetokenizer:
     def output_token_ids(self) -> list[int]:
         return self.token_ids
 
-    def update(self, new_token_ids: list[int], stop_terminated: bool) -> Optional[str]:
+    def update(self, new_token_ids: list[int], stop_terminated: bool) -> str | None:
         self.token_ids.extend(new_token_ids)
         return None
 
@@ -46,7 +45,7 @@ class IncrementalDetokenizer:
     @classmethod
     def from_new_request(
         cls,
-        tokenizer: Optional[AnyTokenizer],
+        tokenizer: AnyTokenizer | None,
         request: EngineCoreRequest,
     ) -> "IncrementalDetokenizer":
         assert request.sampling_params is not None
@@ -85,7 +84,7 @@ class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC):
         # Generation data
         self.output_text = ""
 
-    def update(self, new_token_ids: list[int], stop_terminated: bool) -> Optional[str]:
+    def update(self, new_token_ids: list[int], stop_terminated: bool) -> str | None:
         """
         Update RequestState for the request_id by:
             1) Detokenize the new token ids incrementally.
@@ -224,7 +223,7 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer):
 
         return token or ""
 
-    def _protected_step(self, next_token_id: int) -> Optional[str]:
+    def _protected_step(self, next_token_id: int) -> str | None:
         try:
             token = self.stream.step(self.tokenizer, next_token_id)
         except (OverflowError, TypeError):
@@ -312,7 +311,7 @@ def check_stop_strings(
     new_char_count: int,
     stop: list[str],
     include_in_output: bool,
-) -> Optional[tuple[str, int]]:
+) -> tuple[str, int] | None:
     """Check if any stop strings are matched and truncate sequence
     output text accordingly.
 
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index b2261855d125c..debf8a2192548 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -2,9 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
-from collections.abc import Mapping
+from collections.abc import Callable, Mapping
 from copy import copy
-from typing import Any, Callable, Optional, Union
+from typing import Any
 
 import torch.nn as nn
 from typing_extensions import TypeVar
@@ -52,7 +52,7 @@ class LLMEngine:
         executor_class: type[Executor],
         log_stats: bool,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        stat_loggers: Optional[list[StatLoggerFactory]] = None,
+        stat_loggers: list[StatLoggerFactory] | None = None,
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
         use_cached_outputs: bool = False,
         multiprocess_mode: bool = False,
@@ -126,7 +126,7 @@ class LLMEngine:
             log_stats=self.log_stats,
         )
 
-        self.logger_manager: Optional[StatLoggerManager] = None
+        self.logger_manager: StatLoggerManager | None = None
         if self.log_stats:
             self.logger_manager = StatLoggerManager(
                 vllm_config=vllm_config,
@@ -152,7 +152,7 @@ class LLMEngine:
         cls,
         vllm_config: VllmConfig,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        stat_loggers: Optional[list[StatLoggerFactory]] = None,
+        stat_loggers: list[StatLoggerFactory] | None = None,
         disable_log_stats: bool = False,
     ) -> "LLMEngine":
         return cls(
@@ -169,7 +169,7 @@ class LLMEngine:
         cls,
         engine_args: EngineArgs,
         usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
-        stat_loggers: Optional[list[StatLoggerFactory]] = None,
+        stat_loggers: list[StatLoggerFactory] | None = None,
         enable_multiprocessing: bool = False,
     ) -> "LLMEngine":
         """Creates an LLM engine from the engine arguments."""
@@ -225,14 +225,14 @@ class LLMEngine:
     def add_request(
         self,
         request_id: str,
-        prompt: Union[EngineCoreRequest, PromptType],
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
+        prompt: EngineCoreRequest | PromptType,
+        params: SamplingParams | PoolingParams,
+        arrival_time: float | None = None,
+        lora_request: LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
+        trace_headers: Mapping[str, str] | None = None,
         priority: int = 0,
-        prompt_text: Optional[str] = None,
+        prompt_text: str | None = None,
     ) -> None:
         # Validate the request_id type.
         if not isinstance(request_id, str):
@@ -283,7 +283,7 @@ class LLMEngine:
             # Add the request to EngineCore.
             self.engine_core.add_request(child_request)
 
-    def step(self) -> Union[list[RequestOutput], list[PoolingRequestOutput]]:
+    def step(self) -> list[RequestOutput] | list[PoolingRequestOutput]:
         if self.should_execute_dummy_batch:
             self.should_execute_dummy_batch = False
             self.engine_core.execute_dummy_batch()
@@ -326,13 +326,13 @@ class LLMEngine:
         self.processor.clear_mm_cache()
         self.engine_core.reset_mm_cache()
 
-    def reset_prefix_cache(self, device: Optional[Device] = None):
+    def reset_prefix_cache(self, device: Device | None = None):
         self.engine_core.reset_prefix_cache()
 
     def sleep(self, level: int = 1):
         self.engine_core.sleep(level)
 
-    def wake_up(self, tags: Optional[list[str]] = None):
+    def wake_up(self, tags: list[str] | None = None):
         self.engine_core.wake_up(tags)
 
     def is_sleeping(self) -> bool:
@@ -343,11 +343,11 @@ class LLMEngine:
         return get_metrics_snapshot()
 
     @property
-    def tokenizer(self) -> Optional[AnyTokenizer]:
+    def tokenizer(self) -> AnyTokenizer | None:
         return self.processor.tokenizer
 
     @tokenizer.setter
-    def tokenizer(self, tokenizer: Optional[AnyTokenizer]) -> None:
+    def tokenizer(self, tokenizer: AnyTokenizer | None) -> None:
         self.processor.tokenizer = tokenizer
 
     def get_tokenizer(self) -> AnyTokenizer:
@@ -390,10 +390,10 @@ class LLMEngine:
 
     def collective_rpc(
         self,
-        method: Union[str, Callable[[WorkerBase], _R]],
-        timeout: Optional[float] = None,
+        method: str | Callable[[WorkerBase], _R],
+        timeout: float | None = None,
         args: tuple = (),
-        kwargs: Optional[dict[str, Any]] = None,
+        kwargs: dict[str, Any] | None = None,
     ) -> list[_R]:
         return self.engine_core.collective_rpc(method, timeout, args, kwargs)
 
diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py
index ab0e44fce1558..2cc2df16e413b 100644
--- a/vllm/v1/engine/logprobs.py
+++ b/vllm/v1/engine/logprobs.py
@@ -4,7 +4,6 @@
 import itertools
 from collections.abc import Iterable
 from dataclasses import dataclass
-from typing import Optional
 
 from vllm.logger import init_logger
 from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
@@ -24,19 +23,19 @@ NONES = itertools.repeat(None)
 class LogprobsProcessor:
     # Tokenizer for this request,
     # None if detokenization is disabled.
-    tokenizer: Optional[AnyTokenizer]
+    tokenizer: AnyTokenizer | None
 
     # Logprobs for this request
-    logprobs: Optional[SampleLogprobs]
-    prompt_logprobs: Optional[PromptLogprobs]
-    cumulative_logprob: Optional[float]
-    num_logprobs: Optional[int]
-    num_prompt_logprobs: Optional[int]
+    logprobs: SampleLogprobs | None
+    prompt_logprobs: PromptLogprobs | None
+    cumulative_logprob: float | None
+    num_logprobs: int | None
+    num_prompt_logprobs: int | None
 
     @classmethod
     def from_new_request(
         cls,
-        tokenizer: Optional[AnyTokenizer],
+        tokenizer: AnyTokenizer | None,
         request: EngineCoreRequest,
     ) -> "LogprobsProcessor":
         assert request.sampling_params is not None
@@ -148,7 +147,7 @@ class LogprobsProcessor:
                 )
             )
 
-    def pop_prompt_logprobs(self) -> Optional[PromptLogprobs]:
+    def pop_prompt_logprobs(self) -> PromptLogprobs | None:
         """Pop and return all request prompt logprobs
 
         The logprobs processor aggregates prompt chunk logprobs
@@ -171,7 +170,7 @@ class LogprobsProcessor:
     def _make_logprob_dict(
         logprobs: list[float],
         logprob_token_ids: list[int],
-        decoded_tokens: Iterable[Optional[str]],
+        decoded_tokens: Iterable[str | None],
         rank: int,
         num_logprobs: int,
     ) -> dict[int, Logprob]:
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index eb65b68969e35..2bc1542187c9b 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -4,7 +4,7 @@
 import asyncio
 from collections.abc import Iterable
 from dataclasses import dataclass
-from typing import Any, Optional, Union, cast
+from typing import Any, cast
 
 import torch
 
@@ -36,14 +36,10 @@ class RequestOutputCollector:
 
     def __init__(self, output_kind: RequestOutputKind):
         self.aggregate = output_kind == RequestOutputKind.DELTA
-        self.output: Optional[Union[RequestOutput, PoolingRequestOutput, Exception]] = (
-            None
-        )
+        self.output: RequestOutput | PoolingRequestOutput | Exception | None = None
         self.ready = asyncio.Event()
 
-    def put(
-        self, output: Union[RequestOutput, PoolingRequestOutput, Exception]
-    ) -> None:
+    def put(self, output: RequestOutput | PoolingRequestOutput | Exception) -> None:
         """Non-blocking put operation."""
         if self.output is None or isinstance(output, Exception):
             self.output = output
@@ -53,7 +49,7 @@ class RequestOutputCollector:
             # (if n > 1) do not override each other.
             self.output.add(output, aggregate=self.aggregate)
 
-    async def get(self) -> Union[RequestOutput, PoolingRequestOutput]:
+    async def get(self) -> RequestOutput | PoolingRequestOutput:
         """Get operation blocks on put event."""
         while (output := self.output) is None:
             await self.ready.wait()
@@ -63,7 +59,7 @@ class RequestOutputCollector:
             raise output
         return output
 
-    def get_nowait(self) -> Optional[Union[RequestOutput, PoolingRequestOutput]]:
+    def get_nowait(self) -> RequestOutput | PoolingRequestOutput | None:
         """Non-blocking get operation."""
         output = self.output
         if output is not None:
@@ -76,7 +72,7 @@ class RequestOutputCollector:
 
 @dataclass
 class OutputProcessorOutput:
-    request_outputs: list[Union[RequestOutput, PoolingRequestOutput]]
+    request_outputs: list[RequestOutput | PoolingRequestOutput]
     reqs_to_abort: list[str]
 
 
@@ -84,22 +80,22 @@ class RequestState:
     def __init__(
         self,
         request_id: str,
-        parent_req: Optional[ParentRequest],
+        parent_req: ParentRequest | None,
         request_index: int,
-        lora_name: Optional[str],
+        lora_name: str | None,
         output_kind: RequestOutputKind,
-        prompt: Optional[str],
-        prompt_token_ids: Optional[list[int]],
-        prompt_embeds: Optional[torch.Tensor],
-        logprobs_processor: Optional[LogprobsProcessor],
-        detokenizer: Optional[IncrementalDetokenizer],
-        max_tokens_param: Optional[int],
+        prompt: str | None,
+        prompt_token_ids: list[int] | None,
+        prompt_embeds: torch.Tensor | None,
+        logprobs_processor: LogprobsProcessor | None,
+        detokenizer: IncrementalDetokenizer | None,
+        max_tokens_param: int | None,
         arrival_time: float,
-        queue: Optional[RequestOutputCollector],
+        queue: RequestOutputCollector | None,
         log_stats: bool,
-        top_p: Optional[float] = None,
-        n: Optional[int] = None,
-        temperature: Optional[float] = None,
+        top_p: float | None = None,
+        n: int | None = None,
+        temperature: float | None = None,
     ):
         self.request_id = request_id
         self.parent_req = parent_req
@@ -129,10 +125,10 @@ class RequestState:
         cls,
         tokenizer: AnyTokenizer,
         request: EngineCoreRequest,
-        prompt: Optional[str],
-        parent_req: Optional[ParentRequest],
+        prompt: str | None,
+        parent_req: ParentRequest | None,
         request_index: int,
-        queue: Optional[RequestOutputCollector],
+        queue: RequestOutputCollector | None,
         log_stats: bool,
     ) -> "RequestState":
         if sampling_params := request.sampling_params:
@@ -186,11 +182,11 @@ class RequestState:
     def make_request_output(
         self,
         new_token_ids: list[int],
-        pooling_output: Optional[torch.Tensor],
-        finish_reason: Optional[FinishReason],
-        stop_reason: Union[int, str, None],
-        kv_transfer_params: Optional[dict[str, Any]] = None,
-    ) -> Optional[Union[RequestOutput, PoolingRequestOutput]]:
+        pooling_output: torch.Tensor | None,
+        finish_reason: FinishReason | None,
+        stop_reason: int | str | None,
+        kv_transfer_params: dict[str, Any] | None = None,
+    ) -> RequestOutput | PoolingRequestOutput | None:
         finished = finish_reason is not None
         final_only = self.output_kind == RequestOutputKind.FINAL_ONLY
 
@@ -222,10 +218,10 @@ class RequestState:
     def _new_request_output(
         self,
         request_id: str,
-        outputs: Union[list[CompletionOutput], list[PoolingOutput]],
+        outputs: list[CompletionOutput] | list[PoolingOutput],
         finished: bool,
-        kv_transfer_params: Optional[dict[str, Any]] = None,
-    ) -> Union[RequestOutput, PoolingRequestOutput]:
+        kv_transfer_params: dict[str, Any] | None = None,
+    ) -> RequestOutput | PoolingRequestOutput:
         first_output = outputs[0]
         if isinstance(first_output, PoolingOutput):
             assert len(outputs) == 1
@@ -264,8 +260,8 @@ class RequestState:
     def _new_completion_output(
         self,
         token_ids: list[int],
-        finish_reason: Optional[FinishReason],
-        stop_reason: Union[int, str, None],
+        finish_reason: FinishReason | None,
+        stop_reason: int | str | None,
     ) -> CompletionOutput:
         assert self.detokenizer is not None
         assert self.logprobs_processor is not None
@@ -308,7 +304,7 @@ class OutputProcessor:
         self.request_states: dict[str, RequestState] = {}
         self.parent_requests: dict[str, ParentRequest] = {}
         self.lora_states = LoRARequestStates()
-        self.tracer: Optional[Tracer] = None
+        self.tracer: Tracer | None = None
 
     def get_num_unfinished_requests(self):
         return len(self.request_states)
@@ -360,10 +356,10 @@ class OutputProcessor:
     def add_request(
         self,
         request: EngineCoreRequest,
-        prompt: Optional[str],
-        parent_req: Optional[ParentRequest] = None,
+        prompt: str | None,
+        parent_req: ParentRequest | None = None,
         request_index: int = 0,
-        queue: Optional[RequestOutputCollector] = None,
+        queue: RequestOutputCollector | None = None,
     ) -> None:
         request_id = request.request_id
         if request_id in self.request_states:
@@ -386,8 +382,8 @@ class OutputProcessor:
     def process_outputs(
         self,
         engine_core_outputs: list[EngineCoreOutput],
-        engine_core_timestamp: Optional[float] = None,
-        iteration_stats: Optional[IterationStats] = None,
+        engine_core_timestamp: float | None = None,
+        iteration_stats: IterationStats | None = None,
     ) -> OutputProcessorOutput:
         """
         Process the EngineCoreOutputs:
@@ -411,7 +407,7 @@ class OutputProcessor:
         within the loop below.
         """
 
-        request_outputs: Union[list[RequestOutput], list[PoolingRequestOutput]] = []
+        request_outputs: list[RequestOutput] | list[PoolingRequestOutput] = []
         reqs_to_abort: list[str] = []
         for engine_core_output in engine_core_outputs:
             req_id = engine_core_output.request_id
@@ -492,7 +488,7 @@ class OutputProcessor:
         self,
         engine_core_output: EngineCoreOutput,
         req_state: RequestState,
-        iteration_stats: Optional[IterationStats],
+        iteration_stats: IterationStats | None,
     ) -> None:
         assert req_state.stats is not None
         assert iteration_stats is not None
@@ -555,8 +551,8 @@ class OutputProcessor:
         self,
         req_state: RequestState,
         engine_core_output: EngineCoreOutput,
-        engine_core_timestamp: Optional[float],
-        iteration_stats: Optional[IterationStats],
+        engine_core_timestamp: float | None,
+        iteration_stats: IterationStats | None,
     ):
         if iteration_stats is None:
             return
@@ -577,8 +573,8 @@ class OutputProcessor:
     def _update_stats_from_finished(
         self,
         req_state: RequestState,
-        finish_reason: Optional[FinishReason],
-        iteration_stats: Optional[IterationStats],
+        finish_reason: FinishReason | None,
+        iteration_stats: IterationStats | None,
     ):
         if iteration_stats is None:
             return
diff --git a/vllm/v1/engine/parallel_sampling.py b/vllm/v1/engine/parallel_sampling.py
index daf115c0325ff..2a47befec25f1 100644
--- a/vllm/v1/engine/parallel_sampling.py
+++ b/vllm/v1/engine/parallel_sampling.py
@@ -29,7 +29,7 @@ class ParentRequest:
     max_num_generation_tokens: int
 
     # To efficiently obtain child sampling params
-    cached_child_sampling_params: Optional[SamplingParams]
+    cached_child_sampling_params: SamplingParams | None
 
     def __init__(self, request_id: str, sampling_params: SamplingParams) -> None:
         self.request_id = request_id
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index d106783d6dc12..de15677aeea91 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -3,7 +3,7 @@
 
 import time
 from collections.abc import Mapping
-from typing import Any, Literal, Optional, Union
+from typing import Any, Literal
 
 from vllm.config import VllmConfig
 from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs
@@ -38,7 +38,7 @@ class Processor:
     def __init__(
         self,
         vllm_config: VllmConfig,
-        tokenizer: Optional[AnyTokenizer],
+        tokenizer: AnyTokenizer | None,
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
     ) -> None:
         self.vllm_config = vllm_config
@@ -60,11 +60,11 @@ class Processor:
         )
 
     @property
-    def tokenizer(self) -> Optional[AnyTokenizer]:
+    def tokenizer(self) -> AnyTokenizer | None:
         return self.input_preprocessor.tokenizer
 
     @tokenizer.setter
-    def tokenizer(self, tokenizer: Optional[AnyTokenizer]) -> None:
+    def tokenizer(self, tokenizer: AnyTokenizer | None) -> None:
         self.input_preprocessor.tokenizer = tokenizer
 
     def _validate_logprobs(
@@ -152,7 +152,7 @@ class Processor:
 
     def _validate_params(
         self,
-        params: Union[SamplingParams, PoolingParams],
+        params: SamplingParams | PoolingParams,
     ):
         """
         Validate supported SamplingParam.
@@ -174,7 +174,7 @@ class Processor:
         auto-hashed downstream.
         """
 
-        def _validate_single_prompt(single_prompt: Union[dict, str]) -> None:
+        def _validate_single_prompt(single_prompt: dict | str) -> None:
             if not isinstance(single_prompt, dict):
                 return
             mm_data = single_prompt.get("multi_modal_data")
@@ -214,7 +214,7 @@ class Processor:
         else:
             _validate_single_prompt(prompt)  # type: ignore[arg-type]
 
-    def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None:
+    def _validate_lora(self, lora_request: LoRARequest | None) -> None:
         if lora_request is None:
             return
 
@@ -309,7 +309,7 @@ class Processor:
         self,
         request_id: str,
         prompt: PromptType,
-    ) -> Optional[MultiModalUUIDDict]:
+    ) -> MultiModalUUIDDict | None:
         """Build per-item multimodal hash overrides when enabled. In this case,
         multimodal data items are identified by their request id, modality and
         index rather than their content.
@@ -342,13 +342,13 @@ class Processor:
         self,
         request_id: str,
         prompt: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        tokenization_kwargs: Optional[dict[str, Any]] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
+        params: SamplingParams | PoolingParams,
+        arrival_time: float | None = None,
+        lora_request: LoRARequest | None = None,
+        tokenization_kwargs: dict[str, Any] | None = None,
+        trace_headers: Mapping[str, str] | None = None,
         priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
+        data_parallel_rank: int | None = None,
     ) -> EngineCoreRequest:
         self._validate_lora(lora_request)
         self._validate_params(params)
@@ -445,7 +445,7 @@ class Processor:
             pooling_params = params.clone()
 
         # Multimodal related.
-        mm_features: Optional[list[MultiModalFeatureSpec]] = None
+        mm_features: list[MultiModalFeatureSpec] | None = None
 
         if decoder_inputs["type"] == "multimodal":
             decoder_mm_inputs = decoder_inputs["mm_kwargs"]
@@ -485,7 +485,7 @@ class Processor:
         )
 
     def _validate_model_inputs(
-        self, encoder_inputs: Optional[SingletonInputs], decoder_inputs: SingletonInputs
+        self, encoder_inputs: SingletonInputs | None, decoder_inputs: SingletonInputs
     ):
         if encoder_inputs is not None:
             self._validate_model_input(encoder_inputs, prompt_type="encoder")
@@ -574,7 +574,7 @@ class Processor:
             # check that chunked prefill does not truncate them
             # max_batch_len = self.scheduler_config.max_num_batched_tokens
 
-    def stat_mm_cache(self) -> Optional[MultiModalCacheStats]:
+    def stat_mm_cache(self) -> MultiModalCacheStats | None:
         return self.input_preprocessor.stat_mm_cache()
 
     def clear_mm_cache(self) -> None:
diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py
index ac2a6b997e9fe..e617abf6b2c7d 100644
--- a/vllm/v1/engine/utils.py
+++ b/vllm/v1/engine/utils.py
@@ -4,12 +4,12 @@
 import contextlib
 import os
 import weakref
-from collections.abc import Iterator
+from collections.abc import Callable, Iterator
 from dataclasses import dataclass
 from enum import Enum, auto
 from multiprocessing import Process, connection
 from multiprocessing.process import BaseProcess
-from typing import TYPE_CHECKING, Callable, Optional, Union
+from typing import TYPE_CHECKING
 from unittest.mock import patch
 
 import msgspec
@@ -56,13 +56,13 @@ class EngineZmqAddresses:
     # ZMQ output socket addresses for each front-end client (responses)
     outputs: list[str]
     # ZMQ input socket address of DP coordinator if applicable
-    coordinator_input: Optional[str] = None
+    coordinator_input: str | None = None
     # ZMQ output socket address of DP coordinator if applicable
-    coordinator_output: Optional[str] = None
+    coordinator_output: str | None = None
     # ZMQ socket for front-end to connect to DP coordinator.
     # Not used by engine, just relayed to front-end in handshake response.
     # Only required for external DP LB case.
-    frontend_stats_publish_address: Optional[str] = None
+    frontend_stats_publish_address: str | None = None
 
 
 @dataclass
@@ -73,8 +73,8 @@ class EngineHandshakeMetadata:
     """
 
     addresses: EngineZmqAddresses
-    parallel_config: dict[str, Union[int, str, list[int]]]
-    parallel_config_hash: Optional[str] = None
+    parallel_config: dict[str, int | str | list[int]]
+    parallel_config_hash: str | None = None
 
 
 class CoreEngineProcManager:
@@ -94,7 +94,7 @@ class CoreEngineProcManager:
         handshake_address: str,
         executor_class: type[Executor],
         log_stats: bool,
-        client_handshake_address: Optional[str] = None,
+        client_handshake_address: str | None = None,
     ):
         context = get_mp_context()
         common_kwargs = {
@@ -221,8 +221,8 @@ class CoreEngineActorManager:
         addresses: EngineZmqAddresses,
         executor_class: type[Executor],
         log_stats: bool,
-        placement_groups: Optional[list["PlacementGroup"]] = None,
-        local_dp_ranks: Optional[list[int]] = None,
+        placement_groups: list["PlacementGroup"] | None = None,
+        local_dp_ranks: list[int] | None = None,
     ):
         import copy
 
@@ -675,8 +675,8 @@ def launch_core_engines(
     num_api_servers: int = 1,
 ) -> Iterator[
     tuple[
-        Optional[Union[CoreEngineProcManager, CoreEngineActorManager]],
-        Optional[DPCoordinator],
+        CoreEngineProcManager | CoreEngineActorManager | None,
+        DPCoordinator | None,
         EngineZmqAddresses,
     ]
 ]:
@@ -829,8 +829,8 @@ def wait_for_engine_startup(
     core_engines: list[CoreEngine],
     parallel_config: ParallelConfig,
     cache_config: CacheConfig,
-    proc_manager: Optional[CoreEngineProcManager],
-    coord_process: Optional[Process],
+    proc_manager: CoreEngineProcManager | None,
+    coord_process: Process | None,
 ):
     # Wait for engine core process(es) to send ready messages.
     local_count = parallel_config.data_parallel_size_local
diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py
index 064e4b2bbf181..53617645f52cf 100644
--- a/vllm/v1/executor/abstract.py
+++ b/vllm/v1/executor/abstract.py
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from collections.abc import Callable
 from concurrent.futures import Future
-from typing import Any, Callable, Optional, Union
+from typing import Any
 
 import torch
 import torch.distributed as dist
@@ -91,10 +92,10 @@ class Executor(ExecutorBase):
 
     def collective_rpc(
         self,
-        method: Union[str, Callable],
-        timeout: Optional[float] = None,
+        method: str | Callable,
+        timeout: float | None = None,
         args: tuple = (),
-        kwargs: Optional[dict] = None,
+        kwargs: dict | None = None,
         non_block: bool = False,
     ) -> list[Any]:
         raise NotImplementedError
@@ -103,7 +104,7 @@ class Executor(ExecutorBase):
         self,
         scheduler_output: SchedulerOutput,
         non_block: bool = False,
-    ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]:
+    ) -> ModelRunnerOutput | Future[ModelRunnerOutput]:
         output = self.collective_rpc(
             "execute_model", args=(scheduler_output,), non_block=non_block
         )
@@ -112,7 +113,7 @@ class Executor(ExecutorBase):
     def execute_dummy_batch(self) -> None:
         self.collective_rpc("execute_dummy_batch")
 
-    def take_draft_token_ids(self) -> Optional[DraftTokenIds]:
+    def take_draft_token_ids(self) -> DraftTokenIds | None:
         output = self.collective_rpc("take_draft_token_ids")
         return output[0]
 
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index d92c8f38571e9..e28d29c19a9c6 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -9,6 +9,7 @@ import threading
 import time
 import traceback
 import weakref
+from collections.abc import Callable
 from concurrent.futures import Future, ThreadPoolExecutor
 from dataclasses import dataclass
 from enum import Enum, auto
@@ -17,7 +18,7 @@ from multiprocessing.connection import Connection
 from multiprocessing.process import BaseProcess
 from multiprocessing.synchronize import Lock as LockType
 from threading import Thread
-from typing import Any, Callable, Optional, Union, cast
+from typing import Any, cast
 
 import cloudpickle
 import torch
@@ -59,8 +60,8 @@ class MultiprocExecutor(Executor):
         self._finalizer = weakref.finalize(self, self.shutdown)
         self.is_failed = False
         self.shutdown_event = threading.Event()
-        self.failure_callback: Optional[FailureCallback] = None
-        self.io_thread_pool: Optional[ThreadPoolExecutor] = None
+        self.failure_callback: FailureCallback | None = None
+        self.io_thread_pool: ThreadPoolExecutor | None = None
 
         self.world_size = self.parallel_config.world_size
         tensor_parallel_size = self.parallel_config.tensor_parallel_size
@@ -179,7 +180,7 @@ class MultiprocExecutor(Executor):
         self,
         scheduler_output: SchedulerOutput,
         non_block: bool = False,
-    ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]:
+    ) -> ModelRunnerOutput | Future[ModelRunnerOutput]:
         if not self.has_connector:
             # get output only from a single worker (output_rank)
             (output,) = self.collective_rpc(
@@ -207,7 +208,7 @@ class MultiprocExecutor(Executor):
     def execute_dummy_batch(self) -> None:
         self.collective_rpc("execute_dummy_batch", unique_reply_rank=self.output_rank)
 
-    def take_draft_token_ids(self) -> Optional[DraftTokenIds]:
+    def take_draft_token_ids(self) -> DraftTokenIds | None:
         # OPTIMIZATION: Get output only from a single worker (output_rank)
         outputs = self.collective_rpc(
             "take_draft_token_ids", unique_reply_rank=self.output_rank
@@ -216,12 +217,12 @@ class MultiprocExecutor(Executor):
 
     def collective_rpc(
         self,
-        method: Union[str, Callable],
-        timeout: Optional[float] = None,
+        method: str | Callable,
+        timeout: float | None = None,
         args: tuple = (),
-        kwargs: Optional[dict] = None,
+        kwargs: dict | None = None,
         non_block: bool = False,
-        unique_reply_rank: Optional[int] = None,
+        unique_reply_rank: int | None = None,
     ) -> list[Any]:
         if self.is_failed:
             raise RuntimeError("Executor failed.")
@@ -252,8 +253,8 @@ class MultiprocExecutor(Executor):
 
             def get_response(
                 w: WorkerProcHandle,
-                dequeue_timeout: Optional[float] = None,
-                cancel_event: Optional[threading.Event] = None,
+                dequeue_timeout: float | None = None,
+                cancel_event: threading.Event | None = None,
             ):
                 status, result = w.worker_response_mq.dequeue(
                     timeout=dequeue_timeout, cancel=cancel_event
@@ -370,7 +371,7 @@ class UnreadyWorkerProcHandle:
     proc: BaseProcess
     rank: int
     ready_pipe: Connection
-    death_writer: Optional[Connection] = None
+    death_writer: Connection | None = None
 
 
 @dataclass
@@ -378,7 +379,7 @@ class WorkerProcHandle:
     proc: BaseProcess
     rank: int
     worker_response_mq: MessageQueue  # The worker process writes to this MQ
-    death_writer: Optional[Connection] = None
+    death_writer: Connection | None = None
 
     @classmethod
     def from_unready_handle(
@@ -505,7 +506,7 @@ class WorkerProc:
         )
 
         pipes = {handle.ready_pipe: handle for handle in unready_proc_handles}
-        ready_proc_handles: list[Optional[WorkerProcHandle]] = [None] * len(
+        ready_proc_handles: list[WorkerProcHandle | None] = [None] * len(
             unready_proc_handles
         )
         while pipes:
@@ -674,7 +675,7 @@ class WorkerProc:
             output = self.async_output_queue.get()
             self.enqueue_output(output)
 
-    def worker_busy_loop(self, cancel: Optional[threading.Event] = None):
+    def worker_busy_loop(self, cancel: threading.Event | None = None):
         """Main busy loop for Multiprocessing Workers"""
         while True:
             method, args, kwargs, output_rank = self.rpc_broadcast_mq.dequeue(
diff --git a/vllm/v1/executor/ray_distributed_executor.py b/vllm/v1/executor/ray_distributed_executor.py
index e2c2bfd45d7bd..586df591bfd83 100644
--- a/vllm/v1/executor/ray_distributed_executor.py
+++ b/vllm/v1/executor/ray_distributed_executor.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from concurrent.futures import Future
-from typing import Optional, Union
 
 from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
 from vllm.executor.ray_distributed_executor import (  # noqa
@@ -26,7 +25,7 @@ class FutureWrapper(Future):
     the result() call. If not only the first worker's output is returned.
     """
 
-    def __init__(self, refs, aggregator: Optional[KVOutputAggregator] = None):
+    def __init__(self, refs, aggregator: KVOutputAggregator | None = None):
         super().__init__()
         self.refs = refs
         self.aggregator = aggregator
@@ -66,7 +65,7 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor):
         self,
         scheduler_output: SchedulerOutput,
         non_block: bool = False,
-    ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]:
+    ) -> ModelRunnerOutput | Future[ModelRunnerOutput]:
         """Execute the model on the Ray workers.
 
         Args:
diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py
index 9c28eb92c17a9..a9ef1b92c2433 100644
--- a/vllm/v1/kv_cache_interface.py
+++ b/vllm/v1/kv_cache_interface.py
@@ -4,7 +4,6 @@
 import copy
 from dataclasses import dataclass, fields
 from math import prod
-from typing import Optional
 
 import torch
 from typing_extensions import Self
@@ -74,8 +73,8 @@ class AttentionSpec(KVCacheSpec):
 
 @dataclass(frozen=True)
 class FullAttentionSpec(AttentionSpec):
-    sliding_window: Optional[int] = None
-    attention_chunk_size: Optional[int] = None
+    sliding_window: int | None = None
+    attention_chunk_size: int | None = None
     """
     When hybrid allocator is disabled and the model contains both full 
     attention layers and sliding window attention layers, sliding 
@@ -96,7 +95,7 @@ class FullAttentionSpec(AttentionSpec):
         return cdiv(max_model_len, self.block_size) * self.page_size_bytes
 
     @classmethod
-    def merge_window_sizes(cls, window_sizes: set[int]) -> Optional[int]:
+    def merge_window_sizes(cls, window_sizes: set[int]) -> int | None:
         if len(window_sizes) == 0:
             return None
         elif len(window_sizes) == 1:
@@ -154,7 +153,7 @@ class FullAttentionSpec(AttentionSpec):
 @dataclass(frozen=True)
 class MLAAttentionSpec(FullAttentionSpec):
     # TODO(Lucas/Chen): less hacky way to do this
-    cache_dtype_str: Optional[str] = None
+    cache_dtype_str: str | None = None
 
     @property
     def page_size_bytes(self) -> int:
@@ -237,7 +236,7 @@ class SlidingWindowSpec(AttentionSpec):
 class MambaSpec(KVCacheSpec):
     shapes: tuple[tuple[int, ...], ...]
     dtypes: tuple[torch.dtype]
-    page_size_padded: Optional[int] = None
+    page_size_padded: int | None = None
     mamba_type: str = "mamba2"
     num_speculative_blocks: int = 0
 
@@ -342,7 +341,7 @@ class UniformTypeKVCacheSpecs(KVCacheSpec):
             )
 
     @classmethod
-    def from_specs(cls, kv_cache_specs: dict[str, KVCacheSpec]) -> Optional[Self]:
+    def from_specs(cls, kv_cache_specs: dict[str, KVCacheSpec]) -> Self | None:
         """
         Return a SameTypeKVCacheSpecs object if all layers have the same type
         of KV cache spec. Return None if not.
diff --git a/vllm/v1/kv_offload/abstract.py b/vllm/v1/kv_offload/abstract.py
index ce2d0dffc0ff6..c1d1cbebc1752 100644
--- a/vllm/v1/kv_offload/abstract.py
+++ b/vllm/v1/kv_offload/abstract.py
@@ -30,7 +30,6 @@ The class provides the following primitives:
 from abc import ABC, abstractmethod
 from collections.abc import Iterable
 from dataclasses import dataclass
-from typing import Optional
 
 from vllm.v1.core.kv_cache_utils import BlockHash
 
@@ -122,7 +121,7 @@ class OffloadingManager(ABC):
     @abstractmethod
     def prepare_store(
         self, block_hashes: Iterable[BlockHash]
-    ) -> Optional[PrepareStoreOutput]:
+    ) -> PrepareStoreOutput | None:
         """
         Prepare the given blocks to be offloaded.
         The given blocks will be protected from eviction until
diff --git a/vllm/v1/kv_offload/cpu.py b/vllm/v1/kv_offload/cpu.py
index 0c1cf64a237cb..250ed5e95af4b 100644
--- a/vllm/v1/kv_offload/cpu.py
+++ b/vllm/v1/kv_offload/cpu.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterator
-from typing import Optional
 
 import torch
 
@@ -29,10 +28,10 @@ class CPUOffloadingSpec(OffloadingSpec):
         self.num_cpu_blocks: int = num_cpu_blocks
 
         # scheduler-side
-        self._manager: Optional[OffloadingManager] = None
+        self._manager: OffloadingManager | None = None
 
         # worker-side
-        self._handler: Optional[OffloadingHandler] = None
+        self._handler: OffloadingHandler | None = None
 
     def get_manager(self) -> OffloadingManager:
         if not self._manager:
diff --git a/vllm/v1/kv_offload/factory.py b/vllm/v1/kv_offload/factory.py
index e0a53460e840d..b4d40cb48e1d1 100644
--- a/vllm/v1/kv_offload/factory.py
+++ b/vllm/v1/kv_offload/factory.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import importlib
-from typing import TYPE_CHECKING, Callable
+from collections.abc import Callable
+from typing import TYPE_CHECKING
 
 from vllm.logger import init_logger
 from vllm.v1.kv_offload.spec import OffloadingSpec
diff --git a/vllm/v1/kv_offload/lru_manager.py b/vllm/v1/kv_offload/lru_manager.py
index 36f5eb4a0abdd..0a0111f887905 100644
--- a/vllm/v1/kv_offload/lru_manager.py
+++ b/vllm/v1/kv_offload/lru_manager.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections import OrderedDict
 from collections.abc import Iterable
-from typing import Optional
 
 from vllm.v1.core.kv_cache_utils import BlockHash
 from vllm.v1.kv_offload.abstract import (
@@ -23,7 +22,7 @@ class LRUOffloadingManager(OffloadingManager):
         self.backend: Backend = backend
         # block_hash -> BlockStatus
         self.blocks: OrderedDict[BlockHash, BlockStatus] = OrderedDict()
-        self.events: Optional[list[OffloadingEvent]] = [] if enable_events else None
+        self.events: list[OffloadingEvent] | None = [] if enable_events else None
 
     def lookup(self, block_hashes: Iterable[BlockHash]) -> int:
         hit_count = 0
@@ -57,7 +56,7 @@ class LRUOffloadingManager(OffloadingManager):
 
     def prepare_store(
         self, block_hashes: Iterable[BlockHash]
-    ) -> Optional[PrepareStoreOutput]:
+    ) -> PrepareStoreOutput | None:
         # filter out blocks that are already stored
         block_hashes_to_store = [
             block_hash for block_hash in block_hashes if block_hash not in self.blocks
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 32d2ed2961dee..8c5abae2ae652 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -4,9 +4,10 @@
 import logging
 import time
 from abc import ABC, abstractmethod
-from typing import Callable, Optional, Union
+from collections.abc import Callable
+from typing import TypeAlias
 
-import prometheus_client
+from prometheus_client import Counter, Gauge, Histogram
 
 from vllm.config import SupportsMetricsInfo, VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorLogging
@@ -40,9 +41,9 @@ class StatLoggerBase(ABC):
     @abstractmethod
     def record(
         self,
-        scheduler_stats: Optional[SchedulerStats],
-        iteration_stats: Optional[IterationStats],
-        mm_cache_stats: Optional[MultiModalCacheStats] = None,
+        scheduler_stats: SchedulerStats | None,
+        iteration_stats: IterationStats | None,
+        mm_cache_stats: MultiModalCacheStats | None = None,
         engine_idx: int = 0,
     ): ...
 
@@ -93,9 +94,9 @@ class LoggingStatLogger(StatLoggerBase):
 
     def record(
         self,
-        scheduler_stats: Optional[SchedulerStats],
-        iteration_stats: Optional[IterationStats],
-        mm_cache_stats: Optional[MultiModalCacheStats] = None,
+        scheduler_stats: SchedulerStats | None,
+        iteration_stats: IterationStats | None,
+        mm_cache_stats: MultiModalCacheStats | None = None,
         engine_idx: int = 0,
     ):
         """Log Stats to standard output."""
@@ -178,13 +179,13 @@ class LoggingStatLogger(StatLoggerBase):
 
 
 class PrometheusStatLogger(StatLoggerBase):
-    _gauge_cls = prometheus_client.Gauge
-    _counter_cls = prometheus_client.Counter
-    _histogram_cls = prometheus_client.Histogram
+    _gauge_cls = Gauge
+    _counter_cls = Counter
+    _histogram_cls = Histogram
     _spec_decoding_cls = SpecDecodingProm
 
     def __init__(
-        self, vllm_config: VllmConfig, engine_indexes: Optional[list[int]] = None
+        self, vllm_config: VllmConfig, engine_indexes: list[int] | None = None
     ):
         if engine_indexes is None:
             engine_indexes = [0]
@@ -368,9 +369,7 @@ class PrometheusStatLogger(StatLoggerBase):
             counter_generation_tokens, engine_indexes, model_name
         )
 
-        self.counter_request_success: dict[
-            FinishReason, dict[int, prometheus_client.Counter]
-        ] = {}
+        self.counter_request_success: dict[FinishReason, dict[int, Counter]] = {}
         counter_request_success_base = self._counter_cls(
             name="vllm:request_success",
             documentation="Count of successfully processed requests.",
@@ -660,7 +659,7 @@ class PrometheusStatLogger(StatLoggerBase):
 
         # TODO: This metric might be incorrect in case of using multiple
         # api_server counts which uses prometheus mp.
-        self.gauge_lora_info: Optional[prometheus_client.Gauge] = None
+        self.gauge_lora_info: Gauge | None = None
         if vllm_config.lora_config is not None:
             if len(self.engine_indexes) > 1:
                 raise NotImplementedError("LoRA in DP mode is not supported yet.")
@@ -705,9 +704,9 @@ class PrometheusStatLogger(StatLoggerBase):
 
     def record(
         self,
-        scheduler_stats: Optional[SchedulerStats],
-        iteration_stats: Optional[IterationStats],
-        mm_cache_stats: Optional[MultiModalCacheStats] = None,
+        scheduler_stats: SchedulerStats | None,
+        iteration_stats: IterationStats | None,
+        mm_cache_stats: MultiModalCacheStats | None = None,
         engine_idx: int = 0,
     ):
         """Log to prometheus."""
@@ -826,11 +825,7 @@ class PrometheusStatLogger(StatLoggerBase):
         self.log_metrics_info("cache_config", self.vllm_config.cache_config)
 
 
-PromMetric = Union[
-    prometheus_client.Gauge,
-    prometheus_client.Counter,
-    prometheus_client.Histogram,
-]
+PromMetric: TypeAlias = Gauge | Counter | Histogram
 
 
 def make_per_engine(
@@ -882,8 +877,8 @@ class StatLoggerManager:
     def __init__(
         self,
         vllm_config: VllmConfig,
-        engine_idxs: Optional[list[int]] = None,
-        custom_stat_loggers: Optional[list[StatLoggerFactory]] = None,
+        engine_idxs: list[int] | None = None,
+        custom_stat_loggers: list[StatLoggerFactory] | None = None,
         enable_default_loggers: bool = True,
         client_count: int = 1,
     ):
@@ -924,10 +919,10 @@ class StatLoggerManager:
 
     def record(
         self,
-        scheduler_stats: Optional[SchedulerStats],
-        iteration_stats: Optional[IterationStats],
-        mm_cache_stats: Optional[MultiModalCacheStats] = None,
-        engine_idx: Optional[int] = None,
+        scheduler_stats: SchedulerStats | None,
+        iteration_stats: IterationStats | None,
+        mm_cache_stats: MultiModalCacheStats | None = None,
+        engine_idx: int | None = None,
     ):
         if engine_idx is None:
             engine_idx = 0
diff --git a/vllm/v1/metrics/prometheus.py b/vllm/v1/metrics/prometheus.py
index 5823737968f9a..1eacb785aa843 100644
--- a/vllm/v1/metrics/prometheus.py
+++ b/vllm/v1/metrics/prometheus.py
@@ -3,7 +3,6 @@
 
 import os
 import tempfile
-from typing import Optional
 
 from prometheus_client import REGISTRY, CollectorRegistry, multiprocess
 
@@ -12,7 +11,7 @@ from vllm.logger import init_logger
 logger = init_logger(__name__)
 
 # Global temporary directory for prometheus multiprocessing
-_prometheus_multiproc_dir: Optional[tempfile.TemporaryDirectory] = None
+_prometheus_multiproc_dir: tempfile.TemporaryDirectory | None = None
 
 
 def setup_multiprocess_prometheus():
diff --git a/vllm/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py
index a6fe2062f70cf..b845852a0c0d5 100644
--- a/vllm/v1/metrics/ray_wrappers.py
+++ b/vllm/v1/metrics/ray_wrappers.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import time
-from typing import Optional, Union
 
 from vllm.v1.metrics.loggers import PrometheusStatLogger
 from vllm.v1.spec_decode.metrics import SpecDecodingProm
@@ -63,9 +62,9 @@ class RayGaugeWrapper(RayPrometheusMetric):
     def __init__(
         self,
         name: str,
-        documentation: Optional[str] = "",
-        labelnames: Optional[list[str]] = None,
-        multiprocess_mode: Optional[str] = "",
+        documentation: str | None = "",
+        labelnames: list[str] | None = None,
+        multiprocess_mode: str | None = "",
     ):
         # All Ray metrics are keyed by WorkerId, so multiprocess modes like
         # "mostrecent", "all", "sum" do not apply. This logic can be manually
@@ -77,7 +76,7 @@ class RayGaugeWrapper(RayPrometheusMetric):
             name=name, description=documentation, tag_keys=labelnames_tuple
         )
 
-    def set(self, value: Union[int, float]):
+    def set(self, value: int | float):
         return self.metric.set(value)
 
     def set_to_current_time(self):
@@ -92,8 +91,8 @@ class RayCounterWrapper(RayPrometheusMetric):
     def __init__(
         self,
         name: str,
-        documentation: Optional[str] = "",
-        labelnames: Optional[list[str]] = None,
+        documentation: str | None = "",
+        labelnames: list[str] | None = None,
     ):
         labelnames_tuple = tuple(labelnames) if labelnames else None
         name = self._get_sanitized_opentelemetry_name(name)
@@ -101,7 +100,7 @@ class RayCounterWrapper(RayPrometheusMetric):
             name=name, description=documentation, tag_keys=labelnames_tuple
         )
 
-    def inc(self, value: Union[int, float] = 1.0):
+    def inc(self, value: int | float = 1.0):
         if value == 0:
             return
         return self.metric.inc(value)
@@ -114,9 +113,9 @@ class RayHistogramWrapper(RayPrometheusMetric):
     def __init__(
         self,
         name: str,
-        documentation: Optional[str] = "",
-        labelnames: Optional[list[str]] = None,
-        buckets: Optional[list[float]] = None,
+        documentation: str | None = "",
+        labelnames: list[str] | None = None,
+        buckets: list[float] | None = None,
     ):
         labelnames_tuple = tuple(labelnames) if labelnames else None
         name = self._get_sanitized_opentelemetry_name(name)
@@ -128,7 +127,7 @@ class RayHistogramWrapper(RayPrometheusMetric):
             boundaries=boundaries,
         )
 
-    def observe(self, value: Union[int, float]):
+    def observe(self, value: int | float):
         return self.metric.observe(value)
 
 
diff --git a/vllm/v1/metrics/reader.py b/vllm/v1/metrics/reader.py
index 5d50fa9461d0c..48c88e5b61cb9 100644
--- a/vllm/v1/metrics/reader.py
+++ b/vllm/v1/metrics/reader.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import Optional
 
 from prometheus_client import REGISTRY
 from prometheus_client import Metric as PromMetric
@@ -144,7 +143,7 @@ def get_metrics_snapshot() -> list[Metric]:
     return collected
 
 
-def _get_samples(metric: PromMetric, suffix: Optional[str] = None) -> list[Sample]:
+def _get_samples(metric: PromMetric, suffix: str | None = None) -> list[Sample]:
     name = (metric.name + suffix) if suffix is not None else metric.name
     return [s for s in metric.samples if s.name == name]
 
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index 8d21efca87f44..a4a8ab32ad720 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -4,7 +4,7 @@
 import time
 from collections import deque
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 from vllm.v1.spec_decode.metrics import SpecDecodingStats
 
@@ -152,8 +152,8 @@ class SchedulerStats:
 
     prefix_cache_stats: PrefixCacheStats = field(default_factory=PrefixCacheStats)
 
-    spec_decoding_stats: Optional[SpecDecodingStats] = None
-    kv_connector_stats: Optional[dict[str, Any]] = None
+    spec_decoding_stats: SpecDecodingStats | None = None
+    kv_connector_stats: dict[str, Any] | None = None
 
     num_corrupted_reqs: int = 0
 
@@ -191,7 +191,7 @@ class FinishedRequestStats:
     e2e_latency: float = 0.0
     num_prompt_tokens: int = 0
     num_generation_tokens: int = 0
-    max_tokens_param: Optional[int] = None
+    max_tokens_param: int | None = None
     queued_time: float = 0.0
     prefill_time: float = 0.0
     inference_time: float = 0.0
@@ -230,7 +230,7 @@ class IterationStats:
         is_prefilling: bool,
         prompt_len: int,
         req_stats: RequestStateStats,
-        lora_stats: Optional[LoRAStats],
+        lora_stats: LoRAStats | None,
     ):
         num_new_generation_tokens = len(output.new_token_ids)
 
@@ -265,7 +265,7 @@ class IterationStats:
         events: list["EngineCoreEvent"],
         is_prefilling: bool,
         req_stats: RequestStateStats,
-        lora_stats: Optional[LoRAStats],
+        lora_stats: LoRAStats | None,
     ):
         # Avoid circular dependency
         from vllm.v1.engine import EngineCoreEventType
@@ -287,7 +287,7 @@ class IterationStats:
         self,
         finish_reason: "FinishReason",
         num_prompt_tokens: int,
-        max_tokens_param: Optional[int],
+        max_tokens_param: int | None,
         req_stats: RequestStateStats,
     ):
         e2e_latency = self._time_since(req_stats.arrival_time)
@@ -335,7 +335,7 @@ class LoRARequestStates:
     def __init__(self):
         self.lora_name_to_stats: dict[str, LoRAStats] = {}
 
-    def get_stats(self, req_state: "RequestState") -> Optional[LoRAStats]:
+    def get_stats(self, req_state: "RequestState") -> LoRAStats | None:
         if req_state.lora_name is None:
             return None
         if req_state.lora_name not in self.lora_name_to_stats:
@@ -362,20 +362,20 @@ class LoRARequestStates:
     # Break the pattern for this lifecycle methods so we can
     # call this from IterationStats.update_from_events()
     @staticmethod
-    def scheduled_request(lora_stats: Optional[LoRAStats], request_id: str):
+    def scheduled_request(lora_stats: LoRAStats | None, request_id: str):
         if lora_stats is None:
             return
         lora_stats.waiting_requests.remove(request_id)
         lora_stats.running_requests.add(request_id)
 
     @staticmethod
-    def preempted_request(lora_stats: Optional[LoRAStats], request_id: str):
+    def preempted_request(lora_stats: LoRAStats | None, request_id: str):
         if lora_stats is None:
             return
         lora_stats.running_requests.remove(request_id)
         lora_stats.waiting_requests.add(request_id)
 
-    def update_iteration_stats(self, iteration_stats: Optional[IterationStats]):
+    def update_iteration_stats(self, iteration_stats: IterationStats | None):
         if iteration_stats is None:
             return
         for lora_name, stats in self.lora_name_to_stats.items():
diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py
index d647b207575cf..c224555da6cac 100644
--- a/vllm/v1/outputs.py
+++ b/vllm/v1/outputs.py
@@ -3,12 +3,14 @@
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, NamedTuple, Optional, Union
+from typing import TYPE_CHECKING, NamedTuple
 
 import torch
 
 if TYPE_CHECKING:
     from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
+else:
+    KVConnectorStats = object
 
 
 class LogprobsLists(NamedTuple):
@@ -64,7 +66,7 @@ class LogprobsTensors(NamedTuple):
 
 # [num_reqs, <dynamic>]
 # The shape of each element depends on the pooler used
-PoolerOutput = Union[torch.Tensor, list[torch.Tensor]]
+PoolerOutput = torch.Tensor | list[torch.Tensor]
 
 
 @dataclass
@@ -74,15 +76,15 @@ class SamplerOutput:
     # All requests are padded to max_num_generated_tokens.
     # PLACEHOLDER_TOKEN_ID (-1 by default) is used for padding.
     sampled_token_ids: torch.Tensor
-    logprobs_tensors: Optional[LogprobsTensors]
+    logprobs_tensors: LogprobsTensors | None
 
 
 @dataclass
 class KVConnectorOutput:
     # [req_ids]
-    finished_sending: Optional[set[str]] = None
-    finished_recving: Optional[set[str]] = None
-    kv_connector_stats: Optional["KVConnectorStats"] = None
+    finished_sending: set[str] | None = None
+    finished_recving: set[str] | None = None
+    kv_connector_stats: KVConnectorStats | None = None
     # IDs of externally computed KV blocks that failed to load.
     # Requests referencing these blocks should be rescheduled to recompute them.
     invalid_block_ids: set[int] = field(default_factory=set)
@@ -114,21 +116,21 @@ class ModelRunnerOutput:
     # [num_reqs, max_num_logprobs + 1]
     # [num_reqs, max_num_logprobs + 1]
     # [num_reqs]
-    logprobs: Optional[LogprobsLists]
+    logprobs: LogprobsLists | None
 
     # req_id -> (token_ids, logprobs, ranks)
     # [prompt_len, num_prompt_logprobs]
     # [prompt_len, num_prompt_logprobs]
     # [prompt_len]
-    prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]]
+    prompt_logprobs_dict: dict[str, LogprobsTensors | None]
 
     # [num_reqs, hidden_size]
-    pooler_output: list[Optional[torch.Tensor]]
+    pooler_output: list[torch.Tensor | None]
 
-    kv_connector_output: Optional[KVConnectorOutput] = None
+    kv_connector_output: KVConnectorOutput | None = None
 
     # req_id -> num_nans_in_logits
-    num_nans_in_logits: Optional[dict[str, int]] = None
+    num_nans_in_logits: dict[str, int] | None = None
 
 
 # ModelRunnerOutput wrapper for async scheduling.
diff --git a/vllm/v1/pool/metadata.py b/vllm/v1/pool/metadata.py
index 36ae5b40a3138..2fb320dd2aaf8 100644
--- a/vllm/v1/pool/metadata.py
+++ b/vllm/v1/pool/metadata.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 
@@ -37,9 +36,9 @@ class PoolingMetadata:
     """Tensors for pooling."""
 
     prompt_lens: torch.Tensor  # CPU Tensor
-    prompt_token_ids: Optional[torch.Tensor]
+    prompt_token_ids: torch.Tensor | None
     pooling_params: list[PoolingParams]
-    pooling_cursor: Optional[PoolingCursor] = None
+    pooling_cursor: PoolingCursor | None = None
 
     def __getitem__(self, indices: slice):
         return PoolingMetadata(
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index ac6e583099bc6..5926bf5b46ee9 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -3,9 +3,9 @@
 
 import enum
 import time
-from collections.abc import Mapping
+from collections.abc import Callable, Mapping
 from functools import partial
-from typing import TYPE_CHECKING, Any, Callable, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional
 
 import torch
 
@@ -31,20 +31,20 @@ class Request:
     def __init__(
         self,
         request_id: str,
-        prompt_token_ids: Optional[list[int]],
-        sampling_params: Optional[SamplingParams],
-        pooling_params: Optional[PoolingParams],
-        eos_token_id: Optional[int],
+        prompt_token_ids: list[int] | None,
+        sampling_params: SamplingParams | None,
+        pooling_params: PoolingParams | None,
+        eos_token_id: int | None,
         client_index: int = 0,
-        arrival_time: Optional[float] = None,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        mm_features: Optional[list[MultiModalFeatureSpec]] = None,
+        arrival_time: float | None = None,
+        prompt_embeds: torch.Tensor | None = None,
+        mm_features: list[MultiModalFeatureSpec] | None = None,
         lora_request: Optional["LoRARequest"] = None,
         structured_output_request: Optional["StructuredOutputRequest"] = None,
-        cache_salt: Optional[str] = None,
+        cache_salt: str | None = None,
         priority: int = 0,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        block_hasher: Optional[Callable[["Request"], list["BlockHash"]]] = None,
+        trace_headers: Mapping[str, str] | None = None,
+        block_hasher: Callable[["Request"], list["BlockHash"]] | None = None,
     ) -> None:
         self.request_id = request_id
         self.client_index = client_index
@@ -60,10 +60,10 @@ class Request:
         self.status = RequestStatus.WAITING
         self.use_structured_output = False
         self.events: list[EngineCoreEvent] = []
-        self.stop_reason: Union[int, str, None] = None
+        self.stop_reason: int | str | None = None
 
         # P/D: Connector-specific KV transfer parameters.
-        self.kv_transfer_params: Optional[dict[str, Any]] = None
+        self.kv_transfer_params: dict[str, Any] | None = None
 
         if pooling_params is not None:
             # Pooling models.
@@ -97,7 +97,7 @@ class Request:
         self.num_output_placeholders = 0  # Used in async scheduling.
         self.spec_token_ids: list[int] = []
         self.num_computed_tokens = 0
-        self.cache_salt: Optional[str] = cache_salt
+        self.cache_salt: str | None = cache_salt
 
         # Multi-modal related
         self.mm_features = mm_features or []
@@ -123,7 +123,7 @@ class Request:
         self.num_preemptions = 0
 
         self.block_hashes: list[BlockHash] = []
-        self.get_hash_new_full_blocks: Optional[Callable[[], list[BlockHash]]] = None
+        self.get_hash_new_full_blocks: Callable[[], list[BlockHash]] | None = None
         if block_hasher is not None:
             self.get_hash_new_full_blocks = partial(block_hasher, self)
             self.block_hashes = self.get_hash_new_full_blocks()
@@ -132,7 +132,7 @@ class Request:
     def from_engine_core_request(
         cls,
         request: EngineCoreRequest,
-        block_hasher: Optional[Callable[["Request"], list["BlockHash"]]],
+        block_hasher: Callable[["Request"], list["BlockHash"]] | None,
     ) -> "Request":
         return cls(
             request_id=request.request_id,
@@ -158,7 +158,7 @@ class Request:
 
     def append_output_token_ids(
         self,
-        token_ids: Union[int, list[int]],
+        token_ids: int | list[int],
     ) -> None:
         if isinstance(token_ids, int):
             self._output_token_ids.append(token_ids)
@@ -189,7 +189,7 @@ class Request:
     def is_finished(self) -> bool:
         return RequestStatus.is_finished(self.status)
 
-    def get_finished_reason(self) -> Union[FinishReason, None]:
+    def get_finished_reason(self) -> FinishReason | None:
         return RequestStatus.get_finished_reason(self.status)
 
     def get_num_encoder_tokens(self, input_id: int) -> int:
@@ -200,11 +200,11 @@ class Request:
     def record_event(
         self,
         event_type: EngineCoreEventType,
-        timestamp: Optional[float] = None,
+        timestamp: float | None = None,
     ) -> None:
         self.events.append(EngineCoreEvent.new_event(event_type, timestamp))
 
-    def take_events(self) -> Optional[list[EngineCoreEvent]]:
+    def take_events(self) -> list[EngineCoreEvent] | None:
         if not self.events:
             return None
         events, self.events = self.events, []
@@ -234,7 +234,7 @@ class RequestStatus(enum.IntEnum):
         return status > RequestStatus.PREEMPTED
 
     @staticmethod
-    def get_finished_reason(status: "RequestStatus") -> Union[FinishReason, None]:
+    def get_finished_reason(status: "RequestStatus") -> FinishReason | None:
         return _FINISHED_REASON_MAP.get(status)
 
 
diff --git a/vllm/v1/sample/logits_processor/__init__.py b/vllm/v1/sample/logits_processor/__init__.py
index 06b9e4b12d7b6..566de5bcda772 100644
--- a/vllm/v1/sample/logits_processor/__init__.py
+++ b/vllm/v1/sample/logits_processor/__init__.py
@@ -6,7 +6,7 @@ import itertools
 from abc import abstractmethod
 from collections.abc import Sequence
 from functools import partial
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING
 
 import torch
 
@@ -81,7 +81,7 @@ def _load_logitsprocs_plugins() -> list[type[LogitsProcessor]]:
 
 
 def _load_logitsprocs_by_fqcns(
-    logits_processors: Optional[Sequence[Union[str, type[LogitsProcessor]]]],
+    logits_processors: Sequence[str | type[LogitsProcessor]] | None,
 ) -> list[type[LogitsProcessor]]:
     """Load logit processor types, identifying them by fully-qualified class
     names (FQCNs).
@@ -146,7 +146,7 @@ def _load_logitsprocs_by_fqcns(
 
 
 def _load_custom_logitsprocs(
-    logits_processors: Optional[Sequence[Union[str, type[LogitsProcessor]]]],
+    logits_processors: Sequence[str | type[LogitsProcessor]] | None,
 ) -> list[type[LogitsProcessor]]:
     """Load all custom logits processors.
 
@@ -176,7 +176,7 @@ def build_logitsprocs(
     device: torch.device,
     is_pin_memory: bool,
     is_pooling_model: bool,
-    custom_logitsprocs: Sequence[Union[str, type[LogitsProcessor]]] = (),
+    custom_logitsprocs: Sequence[str | type[LogitsProcessor]] = (),
 ) -> LogitsProcessors:
     if is_pooling_model:
         if custom_logitsprocs:
@@ -249,7 +249,7 @@ class AdapterLogitsProcessor(LogitsProcessor):
     def new_req_logits_processor(
         self,
         params: SamplingParams,
-    ) -> Optional[RequestLogitsProcessor]:
+    ) -> RequestLogitsProcessor | None:
         """Consume request info; return a per-request logits processor.
 
         Return None if logits processor does not need to be applied to request
@@ -267,9 +267,9 @@ class AdapterLogitsProcessor(LogitsProcessor):
     def _new_state(
         self,
         params: SamplingParams,
-        prompt_ids: Optional[list[int]],
+        prompt_ids: list[int] | None,
         output_ids: list[int],
-    ) -> Optional[partial[torch.Tensor]]:
+    ) -> partial[torch.Tensor] | None:
         """Return state representation for new request
 
         Returns None if logits processor is not applicable to request
@@ -292,7 +292,7 @@ class AdapterLogitsProcessor(LogitsProcessor):
             return partial(req_lp, *args)
         return None
 
-    def update_state(self, batch_update: Optional[BatchUpdate]):
+    def update_state(self, batch_update: BatchUpdate | None):
         process_dict_updates(
             self.req_info,
             batch_update,
diff --git a/vllm/v1/sample/logits_processor/builtin.py b/vllm/v1/sample/logits_processor/builtin.py
index 3c3ddda7fb3e4..4ee7dc2880c8c 100644
--- a/vllm/v1/sample/logits_processor/builtin.py
+++ b/vllm/v1/sample/logits_processor/builtin.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Sequence
-from typing import TYPE_CHECKING, Callable, Optional, TypeVar
+from collections.abc import Callable, Sequence
+from typing import TYPE_CHECKING, TypeVar
 
 import torch
 
@@ -49,7 +49,7 @@ class MinPLogitsProcessor(LogitsProcessor):
     def get_min_p_by_index(self, index: int) -> float:
         return float(self.min_p_cpu[index])
 
-    def update_state(self, batch_update: Optional[BatchUpdate]):
+    def update_state(self, batch_update: BatchUpdate | None):
         if not batch_update:
             return
 
@@ -131,7 +131,7 @@ class LogitBiasLogitsProcessor(LogitsProcessor):
         outcome of argmax in greedy sampling."""
         return False
 
-    def update_state(self, batch_update: Optional[BatchUpdate]):
+    def update_state(self, batch_update: BatchUpdate | None):
         needs_update = process_dict_updates(
             self.biases, batch_update, lambda params, _, __: params.logit_bias or None
         )
@@ -185,14 +185,14 @@ class MinTokensLogitsProcessor(LogitsProcessor):
 
     @staticmethod
     def add_request(
-        params: SamplingParams, _: Optional[list[int]], output_tok_ids: list[int]
-    ) -> Optional[tuple[int, Sequence[int], set[int]]]:
+        params: SamplingParams, _: list[int] | None, output_tok_ids: list[int]
+    ) -> tuple[int, Sequence[int], set[int]] | None:
         min_tokens = params.min_tokens
         if not min_tokens or len(output_tok_ids) >= min_tokens:
             return None
         return min_tokens, output_tok_ids, params.all_stop_token_ids
 
-    def update_state(self, batch_update: Optional[BatchUpdate]):
+    def update_state(self, batch_update: BatchUpdate | None):
         needs_update = process_dict_updates(
             self.min_toks, batch_update, self.add_request
         )
@@ -235,8 +235,8 @@ class MinTokensLogitsProcessor(LogitsProcessor):
 
 def process_dict_updates(
     req_entries: dict[int, T],
-    batch_update: Optional[BatchUpdate],
-    new_state: Callable[[SamplingParams, Optional[list[int]], list[int]], Optional[T]],
+    batch_update: BatchUpdate | None,
+    new_state: Callable[[SamplingParams, list[int] | None, list[int]], T | None],
 ) -> bool:
     """Utility function to update dict state for sparse LogitsProcessors."""
 
diff --git a/vllm/v1/sample/logits_processor/interface.py b/vllm/v1/sample/logits_processor/interface.py
index 713bd21d38554..efa0f62ad6e1d 100644
--- a/vllm/v1/sample/logits_processor/interface.py
+++ b/vllm/v1/sample/logits_processor/interface.py
@@ -26,7 +26,7 @@ RemovedRequest = int
 
 # (index, params, prompt_tok_ids, output_tok_ids) tuples for new
 # requests added to the batch.
-AddedRequest = tuple[int, SamplingParams, Optional[list[int]], list[int]]
+AddedRequest = tuple[int, SamplingParams, list[int] | None, list[int]]
 
 # (index 1, index 2, directionality) tuples representing
 # one-way moves or two-way swaps of requests in batch
diff --git a/vllm/v1/sample/logits_processor/state.py b/vllm/v1/sample/logits_processor/state.py
index a601f66415818..c15219da5cf79 100644
--- a/vllm/v1/sample/logits_processor/state.py
+++ b/vllm/v1/sample/logits_processor/state.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Iterator
 from itertools import chain
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 from vllm.v1.sample.logits_processor.interface import (
     AddedRequest,
@@ -43,9 +43,9 @@ class BatchUpdateBuilder:
 
     def __init__(
         self,
-        removed: Optional[list[RemovedRequest]] = None,
-        added: Optional[list[AddedRequest]] = None,
-        moved: Optional[list[MovedRequest]] = None,
+        removed: list[RemovedRequest] | None = None,
+        added: list[AddedRequest] | None = None,
+        moved: list[MovedRequest] | None = None,
     ) -> None:
         self._removed = removed or []
         self.added = added or []
@@ -92,14 +92,14 @@ class BatchUpdateBuilder:
     def has_removed(self) -> bool:
         return bool(self._removed)
 
-    def peek_removed(self) -> Optional[int]:
+    def peek_removed(self) -> int | None:
         """Return lowest removed request index"""
         if self.has_removed():
             self._ensure_removed_sorted()
             return self._removed[-1]
         return None
 
-    def pop_removed(self) -> Optional[int]:
+    def pop_removed(self) -> int | None:
         """Pop lowest removed request index"""
         if self.has_removed():
             self._ensure_removed_sorted()
@@ -116,7 +116,7 @@ class BatchUpdateBuilder:
         self.batch_changed = False
         return batch_changed
 
-    def get_and_reset(self, batch_size: int) -> Optional[BatchUpdate]:
+    def get_and_reset(self, batch_size: int) -> BatchUpdate | None:
         """Generate a logitsprocs batch update data structure and reset
         internal batch update builder state.
 
@@ -148,9 +148,7 @@ class BatchUpdateBuilder:
 class LogitsProcessors:
     """Encapsulates initialized logitsproc objects."""
 
-    def __init__(
-        self, logitsprocs: Optional[Iterator["LogitsProcessor"]] = None
-    ) -> None:
+    def __init__(self, logitsprocs: Iterator["LogitsProcessor"] | None = None) -> None:
         self.argmax_invariant: list[LogitsProcessor] = []
         self.non_argmax_invariant: list[LogitsProcessor] = []
         if logitsprocs:
diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py
index e252ace97d27e..b1101b1b23187 100644
--- a/vllm/v1/sample/metadata.py
+++ b/vllm/v1/sample/metadata.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import Optional
 
 import torch
 
@@ -11,20 +10,20 @@ from vllm.v1.sample.logits_processor import LogitsProcessors
 
 @dataclass
 class SamplingMetadata:
-    temperature: Optional[torch.Tensor]
+    temperature: torch.Tensor | None
     all_greedy: bool
     all_random: bool
 
-    top_p: Optional[torch.Tensor]
-    top_k: Optional[torch.Tensor]
+    top_p: torch.Tensor | None
+    top_k: torch.Tensor | None
 
     generators: dict[int, torch.Generator]
 
     # None means no logprobs, 0 means sampled token logprobs only
-    max_num_logprobs: Optional[int]
+    max_num_logprobs: int | None
 
     no_penalties: bool
-    prompt_token_ids: Optional[torch.Tensor]
+    prompt_token_ids: torch.Tensor | None
     frequency_penalties: torch.Tensor
     presence_penalties: torch.Tensor
     repetition_penalties: torch.Tensor
@@ -33,7 +32,7 @@ class SamplingMetadata:
 
     # `allowed_token_ids_mask` is a 2D bool tensor of shape (max batch size,
     # vocab size).
-    allowed_token_ids_mask: Optional[torch.Tensor]
+    allowed_token_ids_mask: torch.Tensor | None
 
     # req_index -> bad_words_token_ids
     bad_words_token_ids: dict[int, list[list[int]]]
@@ -42,4 +41,4 @@ class SamplingMetadata:
     logitsprocs: LogitsProcessors
 
     # Speculative token ids
-    spec_token_ids: Optional[list[list[int]]] = None
+    spec_token_ids: list[list[int]] | None = None
diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index 5fa7a9ad44cd4..f3322dc8a4ce6 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -84,9 +83,9 @@ class TopKTopPSampler(nn.Module):
         self,
         logits: torch.Tensor,
         generators: dict[int, torch.Generator],
-        k: Optional[torch.Tensor],
-        p: Optional[torch.Tensor],
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        k: torch.Tensor | None,
+        p: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         """
         PyTorch-native implementation of top-k and top-p sampling.
 
@@ -105,9 +104,9 @@ class TopKTopPSampler(nn.Module):
         self,
         logits: torch.Tensor,
         generators: dict[int, torch.Generator],
-        k: Optional[torch.Tensor],
-        p: Optional[torch.Tensor],
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        k: torch.Tensor | None,
+        p: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         """More optimized implementation for top-k and top-p sampling."""
         # We prefer `random_sample` over `flashinfer_sample` when sorting is
         # not needed. This is because `random_sample` does not require
@@ -132,9 +131,9 @@ class TopKTopPSampler(nn.Module):
         self,
         logits: torch.Tensor,
         generators: dict[int, torch.Generator],
-        k: Optional[torch.Tensor],
-        p: Optional[torch.Tensor],
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        k: torch.Tensor | None,
+        p: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         """
         PyTorch-native implementation of top-k and top-p sampling for CPU.
 
@@ -170,8 +169,8 @@ class TopKTopPSampler(nn.Module):
 
 def apply_top_k_top_p(
     logits: torch.Tensor,
-    k: Optional[torch.Tensor],
-    p: Optional[torch.Tensor],
+    k: torch.Tensor | None,
+    p: torch.Tensor | None,
 ) -> torch.Tensor:
     """Apply top-k and top-p masks to the logits.
 
@@ -262,8 +261,8 @@ def random_sample(
 
 def flashinfer_sample(
     logits: torch.Tensor,
-    k: Optional[torch.Tensor],
-    p: Optional[torch.Tensor],
+    k: torch.Tensor | None,
+    p: torch.Tensor | None,
     generators: dict[int, torch.Generator],
 ) -> torch.Tensor:
     """Sample from the logits using FlashInfer.
diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py
index 72cee8c73969a..f5b075e83b842 100644
--- a/vllm/v1/sample/rejection_sampler.py
+++ b/vllm/v1/sample/rejection_sampler.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -49,7 +48,7 @@ class RejectionSampler(nn.Module):
         self,
         metadata: SpecDecodeMetadata,
         # [num_tokens, vocab_size]
-        draft_probs: Optional[torch.Tensor],
+        draft_probs: torch.Tensor | None,
         # [num_tokens, vocab_size]
         target_logits: torch.Tensor,
         # [batch_size, 1]
@@ -216,7 +215,7 @@ class RejectionSampler(nn.Module):
     @staticmethod
     def _combine_outputs_with_spec_tokens(
         output_token_ids: list[list[int]],
-        spec_token_ids: Optional[list[list[int]]] = None,
+        spec_token_ids: list[list[int]] | None = None,
     ) -> list[list[int]]:
         if spec_token_ids is None:
             return output_token_ids
@@ -240,7 +239,7 @@ def rejection_sample(
     # [batch_size]
     cu_num_draft_tokens: torch.Tensor,
     # [num_tokens, vocab_size]
-    draft_probs: Optional[torch.Tensor],
+    draft_probs: torch.Tensor | None,
     # [num_tokens, vocab_size]
     target_probs: torch.Tensor,
     # [batch_size, 1]
@@ -493,7 +492,7 @@ def sample_recovered_tokens(
     # [num_tokens]
     draft_token_ids: torch.Tensor,
     # [num_tokens, vocab_size]
-    draft_probs: Optional[torch.Tensor],
+    draft_probs: torch.Tensor | None,
     # [num_tokens, vocab_size]
     target_probs: torch.Tensor,
     sampling_metadata: SamplingMetadata,
diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py
index 2e076ca8e3c84..5eadc3161f89c 100644
--- a/vllm/v1/sample/sampler.py
+++ b/vllm/v1/sample/sampler.py
@@ -2,8 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A layer that samples the next tokens from the model's outputs."""
 
-from typing import Optional
-
 import torch
 import torch.nn as nn
 
@@ -140,7 +138,7 @@ class Sampler(nn.Module):
         self,
         logits: torch.Tensor,
         sampling_metadata: SamplingMetadata,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         """Sample logits based on sampling metadata.
 
         The various logits processing functions called in this method
@@ -243,7 +241,7 @@ class Sampler(nn.Module):
     @staticmethod
     def _combine_outputs_with_spec_tokens(
         output_token_ids: list[list[int]],
-        spec_token_ids: Optional[list[list[int]]] = None,
+        spec_token_ids: list[list[int]] | None = None,
     ) -> list[list[int]]:
         if spec_token_ids is None:
             return output_token_ids
diff --git a/vllm/v1/sample/tpu/metadata.py b/vllm/v1/sample/tpu/metadata.py
index b58a94d0bf7dc..c4bc88e615bd9 100644
--- a/vllm/v1/sample/tpu/metadata.py
+++ b/vllm/v1/sample/tpu/metadata.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass, field
-from typing import Optional
 
 import torch
 
@@ -48,7 +47,7 @@ class TPUSupportedSamplingMetadata:
 
     min_tokens = None  # impl is not vectorized
 
-    logit_bias: list[Optional[dict[int, float]]] = field(default_factory=lambda: list())
+    logit_bias: list[dict[int, float] | None] = field(default_factory=lambda: list())
 
     allowed_token_ids_mask = None
     bad_words_token_ids = None
diff --git a/vllm/v1/sample/tpu/sampler.py b/vllm/v1/sample/tpu/sampler.py
index ccef283a81829..f81f3a0eefef3 100644
--- a/vllm/v1/sample/tpu/sampler.py
+++ b/vllm/v1/sample/tpu/sampler.py
@@ -2,8 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Sampler layer implementing TPU supported operations."""
 
-from typing import Optional
-
 import torch
 import torch.nn as nn
 
@@ -166,8 +164,8 @@ class Sampler(nn.Module):
 
 def apply_top_k_top_p(
     logits: torch.Tensor,
-    k: Optional[torch.Tensor],
-    p: Optional[torch.Tensor],
+    k: torch.Tensor | None,
+    p: torch.Tensor | None,
 ) -> torch.Tensor:
     """
     Apply top-k and top-p optimized for TPU.
diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py
index f4e1cbd2e0243..528c9671dbfdb 100644
--- a/vllm/v1/serial_utils.py
+++ b/vllm/v1/serial_utils.py
@@ -4,10 +4,10 @@
 import dataclasses
 import importlib
 import pickle
-from collections.abc import Sequence
+from collections.abc import Callable, Sequence
 from inspect import isclass
 from types import FunctionType
-from typing import Any, Callable, Optional, Union
+from typing import Any, TypeAlias
 
 import cloudpickle
 import msgspec
@@ -47,7 +47,7 @@ MMF_CLASS_TO_FACTORY: dict[type[BaseMultiModalField], str] = {
     MultiModalBatchedField: "batched",
 }
 
-bytestr = Union[bytes, bytearray, memoryview, zmq.Frame]
+bytestr: TypeAlias = bytes | bytearray | memoryview | zmq.Frame
 
 
 def _log_insecure_serialization_warning():
@@ -57,7 +57,7 @@ def _log_insecure_serialization_warning():
     )
 
 
-def _typestr(val: Any) -> Optional[tuple[str, str]]:
+def _typestr(val: Any) -> tuple[str, str] | None:
     if val is None:
         return None
     t = type(val)
@@ -111,14 +111,14 @@ class MsgpackEncoder:
     via dedicated messages. Note that this is a per-tensor limit.
     """
 
-    def __init__(self, size_threshold: Optional[int] = None):
+    def __init__(self, size_threshold: int | None = None):
         if size_threshold is None:
             size_threshold = envs.VLLM_MSGPACK_ZERO_COPY_THRESHOLD
         self.encoder = msgpack.Encoder(enc_hook=self.enc_hook)
         # This is used as a local stash of buffers that we can then access from
         # our custom `msgspec` hook, `enc_hook`. We don't have a way to
         # pass custom data to the hook otherwise.
-        self.aux_buffers: Optional[list[bytestr]] = None
+        self.aux_buffers: list[bytestr] | None = None
         self.size_threshold = size_threshold
         if envs.VLLM_ALLOW_INSECURE_SERIALIZATION:
             _log_insecure_serialization_warning()
@@ -195,7 +195,7 @@ class MsgpackEncoder:
 
     def _encode_ndarray(
         self, obj: np.ndarray
-    ) -> tuple[str, tuple[int, ...], Union[int, memoryview]]:
+    ) -> tuple[str, tuple[int, ...], int | memoryview]:
         assert self.aux_buffers is not None
         # If the array is non-contiguous, we need to copy it first
         arr_data = obj.data if obj.flags.c_contiguous else obj.tobytes()
@@ -215,7 +215,7 @@ class MsgpackEncoder:
 
     def _encode_tensor(
         self, obj: torch.Tensor
-    ) -> tuple[str, tuple[int, ...], Union[int, memoryview]]:
+    ) -> tuple[str, tuple[int, ...], int | memoryview]:
         assert self.aux_buffers is not None
         # view the tensor as a contiguous 1D array of bytes
         arr = obj.flatten().contiguous().view(torch.uint8).numpy()
@@ -280,7 +280,7 @@ class MsgpackDecoder:
     not thread-safe when encoding tensors / numpy arrays.
     """
 
-    def __init__(self, t: Optional[Any] = None):
+    def __init__(self, t: Any | None = None):
         args = () if t is None else (t,)
         self.decoder = msgpack.Decoder(
             *args, ext_hook=self.ext_hook, dec_hook=self.dec_hook
@@ -289,7 +289,7 @@ class MsgpackDecoder:
         if envs.VLLM_ALLOW_INSECURE_SERIALIZATION:
             _log_insecure_serialization_warning()
 
-    def decode(self, bufs: Union[bytestr, Sequence[bytestr]]) -> Any:
+    def decode(self, bufs: bytestr | Sequence[bytestr]) -> Any:
         if isinstance(bufs, bytestr):  # type: ignore
             return self.decoder.decode(bufs)
 
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 393a4d964ee3e..ad504da55fd8c 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -3,7 +3,6 @@
 import ast
 from dataclasses import replace
 from importlib.util import find_spec
-from typing import Optional
 
 import numpy as np
 import torch
@@ -79,8 +78,8 @@ class EagleProposer:
             vllm_config.model_config
         )
 
-        self.attn_metadata_builder: Optional[AttentionMetadataBuilder] = None
-        self.draft_indexer_metadata_builder: Optional[AttentionMetadataBuilder] = None
+        self.attn_metadata_builder: AttentionMetadataBuilder | None = None
+        self.draft_indexer_metadata_builder: AttentionMetadataBuilder | None = None
         self.attn_layer_names: list[str] = []
         self.indexer_layer_names: list[str] = []
 
@@ -149,7 +148,7 @@ class EagleProposer:
         )
 
         # Determine allowed attention backends once during initialization.
-        self.allowed_attn_types: Optional[tuple] = None
+        self.allowed_attn_types: tuple | None = None
         if current_platform.is_rocm():
             rocm_types = [TritonAttentionMetadata, FlashAttentionMetadata]
             # vllm.v1.attention.backends.rocm_aiter_fa is an optional backend
@@ -207,10 +206,10 @@ class EagleProposer:
         target_hidden_states: torch.Tensor,
         # [batch_size]
         next_token_ids: torch.Tensor,
-        last_token_indices: Optional[torch.Tensor],
+        last_token_indices: torch.Tensor | None,
         common_attn_metadata: CommonAttentionMetadata,
         sampling_metadata: SamplingMetadata,
-        mm_embed_inputs: Optional[tuple[list[torch.Tensor], torch.Tensor]] = None,
+        mm_embed_inputs: tuple[list[torch.Tensor], torch.Tensor] | None = None,
     ) -> torch.Tensor:
         num_tokens = target_token_ids.shape[0]
         batch_size = next_token_ids.shape[0]
diff --git a/vllm/v1/spec_decode/metrics.py b/vllm/v1/spec_decode/metrics.py
index 89a8a11a3d560..79d856a143ba9 100644
--- a/vllm/v1/spec_decode/metrics.py
+++ b/vllm/v1/spec_decode/metrics.py
@@ -3,7 +3,6 @@
 
 import time
 from dataclasses import dataclass, field
-from typing import Optional
 
 import numpy as np
 import prometheus_client
@@ -143,7 +142,7 @@ class SpecDecodingProm:
 
     def __init__(
         self,
-        speculative_config: Optional[SpeculativeConfig],
+        speculative_config: SpeculativeConfig | None,
         labelnames: list[str],
         per_engine_labelvalues: dict[int, list[str]],
     ):
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
index 1b5e75313d89d..336a0eb98682a 100644
--- a/vllm/v1/structured_output/__init__.py
+++ b/vllm/v1/structured_output/__init__.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import multiprocessing
 from concurrent.futures import Future, ThreadPoolExecutor
 from typing import TYPE_CHECKING
@@ -28,6 +26,9 @@ if TYPE_CHECKING:
 else:
     torch = LazyLoader("torch", globals(), "torch")
 
+    ReasoningParser = object
+    Request = object
+
 logger = init_logger(__name__)
 
 
@@ -168,7 +169,7 @@ class StructuredOutputManager:
         requests: dict[str, Request],
         structured_output_request_ids: dict[str, int],
         scheduled_spec_decode_tokens: dict[str, list[int]],
-    ) -> npt.NDArray[np.int32] | None:
+    ) -> "npt.NDArray[np.int32] | None":
         # Prepare the structured output bitmask for this batch.
         if not structured_output_request_ids:
             return None
diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py
index 081cdfdc9932b..c37193e667aab 100644
--- a/vllm/v1/structured_output/backend_guidance.py
+++ b/vllm/v1/structured_output/backend_guidance.py
@@ -1,13 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import copy
 import json
 import os
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Union
+from typing import TYPE_CHECKING, Any
 
 import torch
 
@@ -47,7 +45,7 @@ def _walk_json_for_additional_properties(data: object):
 
 
 def process_for_additional_properties(
-    guide_json: Union[str, dict[str, Any]],
+    guide_json: str | dict[str, Any],
 ) -> dict[str, Any]:
     if isinstance(guide_json, str):
         guide_json_obj = json.loads(guide_json)
@@ -184,12 +182,12 @@ class GuidanceGrammar(StructuredOutputGrammar):
 
 def serialize_guidance_grammar(
     request_type: StructuredOutputOptions,
-    grammar_spec: Union[str, dict[str, Any]],
+    grammar_spec: str | dict[str, Any],
     disable_any_whitespace: bool = False,
     disable_additional_properties: bool = False,
 ) -> str:
     def _process_schema(
-        grammar_spec: Union[str, dict[str, Any]],
+        grammar_spec: str | dict[str, Any],
     ) -> str:
         if disable_additional_properties:
             grammar_spec = process_for_additional_properties(grammar_spec)
diff --git a/vllm/v1/structured_output/backend_lm_format_enforcer.py b/vllm/v1/structured_output/backend_lm_format_enforcer.py
index d9e484092d6ab..c20e976d84876 100644
--- a/vllm/v1/structured_output/backend_lm_format_enforcer.py
+++ b/vllm/v1/structured_output/backend_lm_format_enforcer.py
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import ast
 import json
 from dataclasses import dataclass, field
@@ -34,7 +32,7 @@ else:
 @lru_cache
 def _cached_build_vllm_token_enforcer_tokenizer_data(
     tokenizer: PreTrainedTokenizerBase, vocab_size: int
-) -> lmfe_vllm.TokenEnforcerTokenizerData:
+) -> "lmfe_vllm.TokenEnforcerTokenizerData":
     return lmfe_vllm.build_vllm_token_enforcer_tokenizer_data(
         tokenizer, use_bitmask=True, vocab_size=vocab_size
     )
diff --git a/vllm/v1/structured_output/backend_outlines.py b/vllm/v1/structured_output/backend_outlines.py
index c9875337179ef..2355f8ab8f893 100644
--- a/vllm/v1/structured_output/backend_outlines.py
+++ b/vllm/v1/structured_output/backend_outlines.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright 2025-present the Outlines developers
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import ast
 import importlib
 import json
diff --git a/vllm/v1/structured_output/backend_types.py b/vllm/v1/structured_output/backend_types.py
index 2051b336e5bf1..7dc9589b63b86 100644
--- a/vllm/v1/structured_output/backend_types.py
+++ b/vllm/v1/structured_output/backend_types.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import enum
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
@@ -13,6 +11,9 @@ if TYPE_CHECKING:
 
     from vllm.config import VllmConfig
     from vllm.transformers_utils.tokenizer import AnyTokenizer
+else:
+    VllmConfig = object
+    AnyTokenizer = object
 
 
 class StructuredOutputOptions(enum.Enum):
@@ -69,7 +70,7 @@ class StructuredOutputGrammar(ABC):
         """
 
     @abstractmethod
-    def fill_bitmask(self, bitmask: torch.Tensor, batch_index: int) -> None:
+    def fill_bitmask(self, bitmask: "torch.Tensor", batch_index: int) -> None:
         """
         Fills the bitmask for a specific batch index.
 
@@ -119,7 +120,7 @@ class StructuredOutputBackend(ABC):
         """
 
     @abstractmethod
-    def allocate_token_bitmask(self, max_num_seqs: int) -> torch.Tensor:
+    def allocate_token_bitmask(self, max_num_seqs: int) -> "torch.Tensor":
         """
         Allocates a token bitmask for the specified maximum number of sequences.
 
diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py
index 4b21b2591c589..1b430157560c0 100644
--- a/vllm/v1/structured_output/backend_xgrammar.py
+++ b/vllm/v1/structured_output/backend_xgrammar.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import json
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any
diff --git a/vllm/v1/structured_output/request.py b/vllm/v1/structured_output/request.py
index 233c7c1e7805d..9e149b186c639 100644
--- a/vllm/v1/structured_output/request.py
+++ b/vllm/v1/structured_output/request.py
@@ -1,13 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import dataclasses
 import functools
 import json
 from concurrent.futures import Future
 from concurrent.futures._base import TimeoutError
-from typing import Optional, Union, cast
+from typing import cast
 
 from vllm.sampling_params import SamplingParams
 from vllm.v1.structured_output.backend_types import (
@@ -20,9 +18,7 @@ from vllm.v1.structured_output.backend_types import (
 @dataclasses.dataclass
 class StructuredOutputRequest:
     sampling_params: SamplingParams
-    _grammar: Union[Future[StructuredOutputGrammar], StructuredOutputGrammar] | None = (
-        None
-    )
+    _grammar: Future[StructuredOutputGrammar] | StructuredOutputGrammar | None = None
     reasoning_ended: bool | None = None
 
     def _check_grammar_completion(self) -> bool:
@@ -46,14 +42,12 @@ class StructuredOutputRequest:
     def grammar(self) -> StructuredOutputGrammar | None:
         completed = self._check_grammar_completion()
         return (
-            cast(Optional[StructuredOutputGrammar], self._grammar)
-            if completed
-            else None
+            cast(StructuredOutputGrammar | None, self._grammar) if completed else None
         )
 
     @grammar.setter
     def grammar(
-        self, grammar: Union[StructuredOutputGrammar, Future[StructuredOutputGrammar]]
+        self, grammar: StructuredOutputGrammar | Future[StructuredOutputGrammar]
     ) -> None:
         self._grammar = grammar
 
diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py
index b7326847d016d..2520dc217c798 100644
--- a/vllm/v1/structured_output/utils.py
+++ b/vllm/v1/structured_output/utils.py
@@ -1,8 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from __future__ import annotations
-
 import hashlib
 import importlib.metadata
 import os
@@ -37,6 +34,10 @@ else:
         "transformers.models.gpt2.tokenization_gpt2",
     )
 
+    AnyTokenizer = object
+    SchedulerOutput = object
+    InputBatch = object
+
 logger = init_logger(__name__)
 
 CACHE = None
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index 9259432628949..f03efe21098bf 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -5,14 +5,13 @@ import contextlib
 import multiprocessing
 import time
 import weakref
-from collections.abc import Sequence
+from collections.abc import Callable, Sequence
 from contextlib import AbstractContextManager
 from multiprocessing import connection
 from multiprocessing.process import BaseProcess
 from typing import (
     TYPE_CHECKING,
     Any,
-    Callable,
     Generic,
     Optional,
     TypeVar,
@@ -66,7 +65,7 @@ class ConstantList(Generic[T], Sequence):
     def clear(self):
         raise TypeError("Cannot clear a constant list")
 
-    def index(self, item: T, start: int = 0, stop: Optional[int] = None) -> int:
+    def index(self, item: T, start: int = 0, stop: int | None = None) -> int:
         return self._x.index(item, start, stop if stop is not None else len(self._x))
 
     @overload
@@ -75,7 +74,7 @@ class ConstantList(Generic[T], Sequence):
     @overload
     def __getitem__(self, s: slice, /) -> list[T]: ...
 
-    def __getitem__(self, item: Union[int, slice]) -> Union[T, list[T]]:
+    def __getitem__(self, item: int | slice) -> T | list[T]:
         return self._x[item]
 
     @overload
@@ -84,7 +83,7 @@ class ConstantList(Generic[T], Sequence):
     @overload
     def __setitem__(self, s: slice, value: T, /): ...
 
-    def __setitem__(self, item: Union[int, slice], value: Union[T, list[T]]):
+    def __setitem__(self, item: int | slice, value: T | list[T]):
         raise TypeError("Cannot set item in a constant list")
 
     def __delitem__(self, item):
@@ -108,7 +107,7 @@ class CpuGpuBuffer:
 
     def __init__(
         self,
-        *size: Union[int, torch.SymInt],
+        *size: int | torch.SymInt,
         dtype: torch.dtype,
         device: torch.device,
         pin_memory: bool,
@@ -128,12 +127,12 @@ class CpuGpuBuffer:
                 )
             self.np = self.cpu.numpy()
 
-    def copy_to_gpu(self, n: Optional[int] = None) -> torch.Tensor:
+    def copy_to_gpu(self, n: int | None = None) -> torch.Tensor:
         if n is None:
             return self.gpu.copy_(self.cpu, non_blocking=True)
         return self.gpu[:n].copy_(self.cpu[:n], non_blocking=True)
 
-    def copy_to_cpu(self, n: Optional[int] = None) -> torch.Tensor:
+    def copy_to_cpu(self, n: int | None = None) -> torch.Tensor:
         """NOTE: Because this method is non-blocking, explicit synchronization
         is needed to ensure the data is copied to CPU."""
         if n is None:
@@ -173,7 +172,7 @@ class APIServerProcessManager:
         num_servers: int,
         input_addresses: list[str],
         output_addresses: list[str],
-        stats_update_address: Optional[str] = None,
+        stats_update_address: str | None = None,
     ):
         """Initialize and start API server worker processes.
 
@@ -227,9 +226,8 @@ class APIServerProcessManager:
 
 def wait_for_completion_or_failure(
     api_server_manager: APIServerProcessManager,
-    engine_manager: Optional[
-        Union["CoreEngineProcManager", "CoreEngineActorManager"]
-    ] = None,
+    engine_manager: Union["CoreEngineProcManager", "CoreEngineActorManager"]
+    | None = None,
     coordinator: Optional["DPCoordinator"] = None,
 ) -> None:
     """Wait for all processes to complete or detect if any fail.
diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py
index 0c44834b55056..9bf06d51609f6 100644
--- a/vllm/v1/worker/block_table.py
+++ b/vllm/v1/worker/block_table.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Union
 
 import numpy as np
 import torch
@@ -215,7 +214,7 @@ class BlockTable:
         return self.block_table.np
 
     def _make_buffer(
-        self, *size: Union[int, torch.SymInt], dtype: torch.dtype
+        self, *size: int | torch.SymInt, dtype: torch.dtype
     ) -> CpuGpuBuffer:
         return CpuGpuBuffer(
             *size, dtype=dtype, device=self.device, pin_memory=self.pin_memory
diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py
index 299567427027e..5aebfec06dfd5 100644
--- a/vllm/v1/worker/cpu_model_runner.py
+++ b/vllm/v1/worker/cpu_model_runner.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from contextlib import contextmanager
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 
 import torch
 import torch.nn as nn
@@ -95,7 +95,7 @@ class CPUModelRunner(GPUModelRunner):
     def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]:
         return sampled_token_ids.tolist()
 
-    def get_dp_padding(self, num_tokens: int) -> tuple[int, Optional[torch.Tensor]]:
+    def get_dp_padding(self, num_tokens: int) -> tuple[int, torch.Tensor | None]:
         # Note: For CPU backend, dp padding is not required for now.
         return 0, None
 
diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py
index ee865ec8e6493..d3cf457ab5da4 100644
--- a/vllm/v1/worker/cpu_worker.py
+++ b/vllm/v1/worker/cpu_worker.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 import platform
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import torch
 
@@ -91,7 +91,7 @@ class CPUWorker(Worker):
         logger.warning("sleep mode is not supported on CPU, ignore it.")
         pass
 
-    def wake_up(self, tags: Optional[list[str]] = None) -> None:
+    def wake_up(self, tags: list[str] | None = None) -> None:
         logger.warning("sleep mode is not supported on CPU, ignore it.")
         pass
 
diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py
index 1bb6a6f4d05f7..3f24ff0a09de9 100644
--- a/vllm/v1/worker/dp_utils.py
+++ b/vllm/v1/worker/dp_utils.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Optional
 
 import numpy as np
 import torch
@@ -95,7 +94,7 @@ def _synchronize_dp_ranks(
     should_attempt_ubatching: bool,
     should_attempt_dp_padding: bool,
     parallel_config: ParallelConfig,
-) -> tuple[bool, Optional[torch.Tensor]]:
+) -> tuple[bool, torch.Tensor | None]:
     """
     1. Decides if each DP rank is going to microbatch. Either all ranks
     run with microbatching or none of them do.
@@ -156,10 +155,10 @@ def coordinate_batch_across_dp(
     allow_microbatching: bool,
     allow_dp_padding: bool,
     parallel_config: ParallelConfig,
-    num_tokens_padded: Optional[int] = None,
-    uniform_decode: Optional[bool] = None,
-    num_scheduled_tokens_per_request: Optional[np.ndarray] = None,
-) -> tuple[Optional[UBatchSlices], Optional[torch.Tensor]]:
+    num_tokens_padded: int | None = None,
+    uniform_decode: bool | None = None,
+    num_scheduled_tokens_per_request: np.ndarray | None = None,
+) -> tuple[UBatchSlices | None, torch.Tensor | None]:
     """
     Coordinates amongst all DP ranks to determine if and how the full batch
     should be split into microbatches.
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 0ced400bcb663..b8751546f7673 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -3,7 +3,7 @@
 # Datastructures defining a GPU input batch
 
 from dataclasses import dataclass
-from typing import Optional, cast
+from typing import cast
 
 import numpy as np
 import torch
@@ -29,21 +29,21 @@ from vllm.v1.worker.block_table import MultiGroupBlockTable
 @dataclass
 class CachedRequestState:
     req_id: str
-    prompt_token_ids: Optional[list[int]]
+    prompt_token_ids: list[int] | None
     mm_features: list[MultiModalFeatureSpec]
-    sampling_params: Optional[SamplingParams]
-    pooling_params: Optional[PoolingParams]
-    generator: Optional[torch.Generator]
+    sampling_params: SamplingParams | None
+    pooling_params: PoolingParams | None
+    generator: torch.Generator | None
 
     block_ids: tuple[list[int], ...]
     num_computed_tokens: int
     output_token_ids: list[int]
 
-    mrope_positions: Optional[torch.Tensor] = None
-    mrope_position_delta: Optional[int] = None
+    mrope_positions: torch.Tensor | None = None
+    mrope_position_delta: int | None = None
 
-    lora_request: Optional[LoRARequest] = None
-    prompt_embeds: Optional[torch.Tensor] = None
+    lora_request: LoRARequest | None = None
+    prompt_embeds: torch.Tensor | None = None
 
     def __post_init__(self):
         self.num_prompt_tokens = length_from_prompt_token_ids_or_embeds(
@@ -78,7 +78,7 @@ class InputBatch:
         vocab_size: int,
         block_sizes: list[int],  # The block_size of each kv cache group
         kernel_block_sizes: list[int],
-        logitsprocs: Optional[LogitsProcessors] = None,
+        logitsprocs: LogitsProcessors | None = None,
         logitsprocs_need_output_token_ids: bool = False,
         is_spec_decode: bool = False,
         is_pooling_model: bool = False,
@@ -93,7 +93,7 @@ class InputBatch:
         self.pin_memory = pin_memory
         self.vocab_size = vocab_size
 
-        self._req_ids: list[Optional[str]] = []
+        self._req_ids: list[str | None] = []
         self.req_id_to_index: dict[str, int] = {}
 
         # TODO(woosuk): This buffer could be too large if max_model_len is big.
@@ -228,15 +228,15 @@ class InputBatch:
         self.has_allowed_token_ids: set[str] = set()
         # NOTE(lufang): In the mask tensor, if the corresponding token allowed,
         # the value is False. Since we use masked_fill_ to set -inf.
-        self.allowed_token_ids_mask: Optional[torch.Tensor] = None
-        self.allowed_token_ids_mask_cpu_tensor: Optional[torch.Tensor] = None
+        self.allowed_token_ids_mask: torch.Tensor | None = None
+        self.allowed_token_ids_mask_cpu_tensor: torch.Tensor | None = None
 
         # req_index -> bad_words_token_ids
         self.bad_words_token_ids: dict[int, list[list[int]]] = {}
 
         self.logits_processing_needs_token_ids = np.zeros(max_num_reqs, dtype=bool)
 
-        self.req_output_token_ids: list[Optional[list[int]]] = []
+        self.req_output_token_ids: list[list[int] | None] = []
 
         # Store provided logitsprocs. If none are provided, initialize empty
         # data structure
@@ -244,7 +244,7 @@ class InputBatch:
         self.logitsprocs_need_output_token_ids = logitsprocs_need_output_token_ids
 
         # Store last speculative tokens for sampler.
-        self.spec_token_ids: list[Optional[list[int]]] = []
+        self.spec_token_ids: list[list[int] | None] = []
 
         # This is updated each time the batch constituents change.
         self.sampling_metadata = self._make_sampling_metadata()
@@ -252,13 +252,13 @@ class InputBatch:
         self.pooling_params: dict[str, PoolingParams] = {}
 
         # Cached reference to the GPU tensor of previously sampled tokens
-        self.prev_sampled_token_ids: Optional[torch.Tensor] = None
-        self.prev_req_id_to_index: Optional[dict[str, int]] = None
+        self.prev_sampled_token_ids: torch.Tensor | None = None
+        self.prev_req_id_to_index: dict[str, int] | None = None
         # These are used to update output_token_ids with real sampled
         # ids from prior step, if required by current sampling params
         # (e.g. penalties).
-        self.sampled_token_ids_cpu: Optional[torch.Tensor] = None
-        self.async_copy_ready_event: Optional[torch.cuda.Event] = None
+        self.sampled_token_ids_cpu: torch.Tensor | None = None
+        self.async_copy_ready_event: torch.cuda.Event | None = None
 
     @property
     def req_ids(self) -> list[str]:
@@ -438,7 +438,7 @@ class InputBatch:
 
         return req_index
 
-    def remove_request(self, req_id: str) -> Optional[int]:
+    def remove_request(self, req_id: str) -> int | None:
         """This method must always be followed by a call to condense().
 
         Args:
@@ -796,7 +796,7 @@ class InputBatch:
             else []
         )
 
-        allowed_token_ids_mask: Optional[torch.Tensor] = None
+        allowed_token_ids_mask: torch.Tensor | None = None
         if not self.no_allowed_token_ids:
             assert self.allowed_token_ids_mask is not None
             copy_slice(
@@ -954,7 +954,7 @@ class InputBatch:
         )
 
     @property
-    def max_num_logprobs(self) -> Optional[int]:
+    def max_num_logprobs(self) -> int | None:
         return max(self.num_logprobs.values()) if self.num_logprobs else None
 
     @property
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index ec824f6d6bf5e..0d99597fa641f 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -8,14 +8,13 @@ from collections import defaultdict
 from collections.abc import Iterator
 from contextlib import contextmanager
 from copy import deepcopy
-from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union, cast
+from typing import TYPE_CHECKING, Any, NamedTuple, TypeAlias, cast
 
 import numpy as np
 import torch
 import torch.distributed
 import torch.nn as nn
 from tqdm import tqdm
-from typing_extensions import TypeAlias
 
 import vllm.envs as envs
 from vllm.attention import Attention, AttentionType
@@ -162,7 +161,7 @@ logger = init_logger(__name__)
 
 AttnMetadataDict: TypeAlias = dict[str, AttentionMetadata]
 # list when ubatching is enabled
-PerLayerAttnMetadata: TypeAlias = Union[list[AttnMetadataDict], AttnMetadataDict]
+PerLayerAttnMetadata: TypeAlias = list[AttnMetadataDict] | AttnMetadataDict
 
 
 # Wrapper for ModelRunnerOutput to support overlapped execution.
@@ -295,7 +294,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # Sampler
         self.sampler = Sampler(logprobs_mode=self.model_config.logprobs_mode)
 
-        self.eplb_state: Optional[EplbState] = None
+        self.eplb_state: EplbState | None = None
         """
         State of the expert parallelism load balancer.
 
@@ -447,14 +446,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         # CUDA event to synchronize use of reused CPU tensors between steps
         # when async scheduling is enabled.
-        self.prepare_inputs_event: Optional[torch.cuda.Event] = None
+        self.prepare_inputs_event: torch.cuda.Event | None = None
         if self.use_async_scheduling:
             self.prepare_inputs_event = torch.cuda.Event()
             # Start in a completed state.
             self.prepare_inputs_event.record(torch.cuda.default_stream())
 
         # None in the first PP rank. The rest are set after load_model.
-        self.intermediate_tensors: Optional[IntermediateTensors] = None
+        self.intermediate_tensors: IntermediateTensors | None = None
 
         # OPTIMIZATION: Cache the tensors rather than creating them every step.
         # Keep in int64 to avoid overflow with long context
@@ -495,7 +494,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             else None
         )
 
-        self.reorder_batch_threshold: Optional[int] = None
+        self.reorder_batch_threshold: int | None = None
 
         # Attention layers that are only in the KVCacheConfig of the runner
         # (e.g., KV sharing, encoder-only attention), but not in the
@@ -503,7 +502,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self.runner_only_attn_layers: set[str] = set()
 
         # Cached outputs.
-        self._draft_token_ids: Optional[Union[list[list[int]], torch.Tensor]] = None
+        self._draft_token_ids: list[list[int]] | torch.Tensor | None = None
         self.transfer_event = torch.cuda.Event()
         self.sampled_token_ids_pinned_cpu = torch.empty(
             (self.max_model_len, 1),
@@ -527,7 +526,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             return self.positions.gpu[num_tokens]
 
     def _make_buffer(
-        self, *size: Union[int, torch.SymInt], dtype: torch.dtype, numpy: bool = True
+        self, *size: int | torch.SymInt, dtype: torch.dtype, numpy: bool = True
     ) -> CpuGpuBuffer:
         return CpuGpuBuffer(
             *size,
@@ -928,7 +927,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
     def _get_cumsum_and_arange(
         self,
         num_tokens: np.ndarray,
-        cumsum_dtype: Optional[np.dtype] = None,
+        cumsum_dtype: np.dtype | None = None,
     ) -> tuple[np.ndarray, np.ndarray]:
         """Get the cumulative sum and batched arange of the given array.
         # E.g., [2, 5, 3] -> ([2, 7, 10], [0, 1, 0, 1, 2, 3, 4, 0, 1, 2])
@@ -1024,7 +1023,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         scheduler_output: "SchedulerOutput",
         kv_cache_spec: KVCacheSpec,
         num_reqs: int,
-    ) -> Optional[np.ndarray]:
+    ) -> np.ndarray | None:
         if not isinstance(kv_cache_spec, CrossAttentionSpec):
             return None
 
@@ -1042,12 +1041,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
     ) -> tuple[
         PerLayerAttnMetadata,
         torch.Tensor,
-        Optional[SpecDecodeMetadata],
+        SpecDecodeMetadata | None,
         np.ndarray,
-        Optional[CommonAttentionMetadata],
+        CommonAttentionMetadata | None,
         int,
-        Optional[UBatchSlices],
-        Optional[torch.Tensor],
+        UBatchSlices | None,
+        torch.Tensor | None,
         bool,
     ]:
         """
@@ -2048,7 +2047,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         )
         self._sync_device()
 
-        pooler_output: list[Optional[torch.Tensor]] = []
+        pooler_output: list[torch.Tensor | None] = []
         for raw_output, seq_len, prompt_len in zip(
             raw_pooler_output, seq_lens_cpu, pooling_metadata.prompt_lens
         ):
@@ -2091,13 +2090,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self,
         scheduler_output: "SchedulerOutput",
         num_input_tokens: int,  # Padded
-        intermediate_tensors: Optional[IntermediateTensors] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
     ) -> tuple[
         int,
-        Optional[torch.Tensor],
-        Optional[torch.Tensor],
+        torch.Tensor | None,
+        torch.Tensor | None,
         torch.Tensor,
-        Optional[IntermediateTensors],
+        IntermediateTensors | None,
         dict[str, Any],
     ]:
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
@@ -2197,8 +2196,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
     def _sample(
         self,
-        logits: Optional[torch.Tensor],
-        spec_decode_metadata: Optional[SpecDecodeMetadata],
+        logits: torch.Tensor | None,
+        spec_decode_metadata: SpecDecodeMetadata | None,
     ) -> SamplerOutput:
         # Sample the next token and get logprobs if needed.
         sampling_metadata = self.input_batch.sampling_metadata
@@ -2243,14 +2242,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self,
         scheduler_output: "SchedulerOutput",
         sampler_output: SamplerOutput,
-        logits: Optional[torch.Tensor],
+        logits: torch.Tensor | None,
         hidden_states: torch.Tensor,
         num_scheduled_tokens: int,
     ) -> tuple[
         dict[str, int],
-        Optional[LogprobsLists],
+        LogprobsLists | None,
         list[list[int]],
-        dict[str, Optional[LogprobsTensors]],
+        dict[str, LogprobsTensors | None],
         list[str],
         dict[str, int],
         list[int],
@@ -2377,10 +2376,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
     def _model_forward(
         self,
-        input_ids: Optional[torch.Tensor] = None,
-        positions: Optional[torch.Tensor] = None,
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        input_ids: torch.Tensor | None = None,
+        positions: torch.Tensor | None = None,
+        intermediate_tensors: IntermediateTensors | None = None,
+        inputs_embeds: torch.Tensor | None = None,
         **model_kwargs: dict[str, Any],
     ) -> Any:
         """Helper method to call the model forward pass.
@@ -2411,8 +2410,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
     def execute_model(
         self,
         scheduler_output: "SchedulerOutput",
-        intermediate_tensors: Optional[IntermediateTensors] = None,
-    ) -> Union[ModelRunnerOutput, AsyncModelRunnerOutput, IntermediateTensors]:
+        intermediate_tensors: IntermediateTensors | None = None,
+    ) -> ModelRunnerOutput | AsyncModelRunnerOutput | IntermediateTensors:
         with record_function_or_nullcontext("Preprocess"):
             with self.synchronize_input_prep():
                 # Update persistent batch states.
@@ -2678,7 +2677,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         return async_output
 
-    def take_draft_token_ids(self) -> Optional[DraftTokenIds]:
+    def take_draft_token_ids(self) -> DraftTokenIds | None:
         if self._draft_token_ids is None:
             return None
         req_ids = self.input_batch.req_ids
@@ -2692,14 +2691,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
     def propose_draft_token_ids(
         self,
         scheduler_output: "SchedulerOutput",
-        sampled_token_ids: Union[torch.Tensor, list[list[int]]],
+        sampled_token_ids: torch.Tensor | list[list[int]],
         sampling_metadata: SamplingMetadata,
         hidden_states: torch.Tensor,
         sample_hidden_states: torch.Tensor,
-        aux_hidden_states: Optional[list[torch.Tensor]],
-        spec_decode_metadata: Optional[SpecDecodeMetadata],
+        aux_hidden_states: list[torch.Tensor] | None,
+        spec_decode_metadata: SpecDecodeMetadata | None,
         common_attn_metadata: CommonAttentionMetadata,
-    ) -> Union[list[list[int]], torch.Tensor]:
+    ) -> list[list[int]] | torch.Tensor:
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         if self.speculative_config.method == "ngram":
             assert isinstance(sampled_token_ids, list)
@@ -2961,7 +2960,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                     self.model, self.vllm_config, CUDAGraphMode.NONE, self.device
                 )
 
-    def _get_eagle3_aux_layers_from_config(self) -> Optional[tuple[int, ...]]:
+    def _get_eagle3_aux_layers_from_config(self) -> tuple[int, ...] | None:
         """Extract Eagle3 auxiliary layer indices from speculative config.
 
         These indices specify which hidden states from the base model should
@@ -3007,13 +3006,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self,
         hidden_states: torch.Tensor,
         num_scheduled_tokens: dict[str, int],
-    ) -> dict[str, Optional[LogprobsTensors]]:
+    ) -> dict[str, LogprobsTensors | None]:
         num_prompt_logprobs_dict = self.input_batch.num_prompt_logprobs
         if not num_prompt_logprobs_dict:
             return {}
 
         in_progress_dict = self.input_batch.in_progress_prompt_logprobs_cpu
-        prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]] = {}
+        prompt_logprobs_dict: dict[str, LogprobsTensors | None] = {}
 
         # Since prompt logprobs are a rare feature, prioritize simple,
         # maintainable loop over optimal performance.
@@ -3107,7 +3106,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
     def _get_nans_in_logits(
         self,
-        logits: Optional[torch.Tensor],
+        logits: torch.Tensor | None,
     ) -> dict[str, int]:
         try:
             if logits is None:
@@ -3190,7 +3189,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
     def _dummy_run(
         self,
         num_tokens: int,
-        cudagraph_runtime_mode: Optional[CUDAGraphMode] = None,
+        cudagraph_runtime_mode: CUDAGraphMode | None = None,
         force_attention: bool = False,
         uniform_decode: bool = False,
         allow_microbatching: bool = True,
@@ -3294,7 +3293,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             dp_rank = self.parallel_config.data_parallel_rank
             num_tokens_after_padding = int(num_tokens_across_dp[dp_rank])
 
-        attn_metadata: Optional[PerLayerAttnMetadata] = None
+        attn_metadata: PerLayerAttnMetadata | None = None
 
         # If force_attention is True, we always capture attention. Otherwise,
         # it only happens for cudagraph_runtime_mode=FULL.
diff --git a/vllm/v1/worker/gpu_ubatch_wrapper.py b/vllm/v1/worker/gpu_ubatch_wrapper.py
index fb63fe8d25430..3e6fd86e95d88 100644
--- a/vllm/v1/worker/gpu_ubatch_wrapper.py
+++ b/vllm/v1/worker/gpu_ubatch_wrapper.py
@@ -2,8 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import threading
+from collections.abc import Callable
 from dataclasses import dataclass
-from typing import Any, Callable, Optional
+from typing import Any
 
 import torch
 
@@ -32,8 +33,8 @@ class UbatchMetadata:
     context: UBatchContext
     input_ids: torch.Tensor
     positions: torch.Tensor
-    inputs_embeds: Optional[torch.Tensor]
-    intermediate_tensors: Optional[IntermediateTensors]
+    inputs_embeds: torch.Tensor | None
+    intermediate_tensors: IntermediateTensors | None
     num_tokens: int
 
 
@@ -41,7 +42,7 @@ class UbatchMetadata:
 class CUDAGraphMetaData:
     cudagraph: torch.cuda.CUDAGraph
     ubatch_metadata: UbatchMetadata
-    outputs: Optional[Any] = None
+    outputs: Any | None = None
 
 
 class SMControlContextManager:
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 119e474b1fca9..0e9ab3f9148b9 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -6,7 +6,7 @@ import copy
 import gc
 import os
 from contextlib import AbstractContextManager, nullcontext
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any
 
 import torch
 import torch.distributed
@@ -132,7 +132,7 @@ class Worker(WorkerBase):
             used_bytes / GiB_bytes,
         )
 
-    def wake_up(self, tags: Optional[list[str]] = None) -> None:
+    def wake_up(self, tags: list[str] | None = None) -> None:
         from vllm.device_allocator.cumem import CuMemAllocator
 
         allocator = CuMemAllocator.get_instance()
@@ -456,7 +456,7 @@ class Worker(WorkerBase):
     def execute_model(
         self,
         scheduler_output: "SchedulerOutput",
-    ) -> Optional[Union[ModelRunnerOutput, AsyncModelRunnerOutput]]:
+    ) -> ModelRunnerOutput | AsyncModelRunnerOutput | None:
         intermediate_tensors = None
         forward_pass = scheduler_output.total_num_scheduled_tokens > 0
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
@@ -504,7 +504,7 @@ class Worker(WorkerBase):
         output.kv_connector_output = kv_connector_output
         return output
 
-    def take_draft_token_ids(self) -> Optional[DraftTokenIds]:
+    def take_draft_token_ids(self) -> DraftTokenIds | None:
         return self.model_runner.take_draft_token_ids()
 
     def profile(self, is_start: bool = True):
@@ -565,7 +565,7 @@ class Worker(WorkerBase):
         self,
         old_ep_size: int,
         new_ep_size: int,
-        global_expert_load: Optional[torch.Tensor],
+        global_expert_load: torch.Tensor | None,
     ) -> None:
         from vllm.distributed.parallel_state import get_ep_group
 
@@ -611,7 +611,7 @@ class Worker(WorkerBase):
 
     def _reconfigure_moe(
         self, old_ep_size: int, new_ep_size: int
-    ) -> Optional[torch.Tensor]:
+    ) -> torch.Tensor | None:
         """
         Reconfigure MoE modules with provided reconfig_request
 
@@ -730,8 +730,8 @@ class Worker(WorkerBase):
     def save_sharded_state(
         self,
         path: str,
-        pattern: Optional[str] = None,
-        max_size: Optional[int] = None,
+        pattern: str | None = None,
+        max_size: int | None = None,
     ) -> None:
         from vllm.model_executor.model_loader import ShardedStateLoader
 
@@ -758,7 +758,7 @@ class Worker(WorkerBase):
 def init_worker_distributed_environment(
     vllm_config: VllmConfig,
     rank: int,
-    distributed_init_method: Optional[str] = None,
+    distributed_init_method: str | None = None,
     local_rank: int = -1,
     backend: str = "nccl",
 ) -> None:
diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py
index 473982bebb127..db037a9fccd5c 100644
--- a/vllm/v1/worker/kv_connector_model_runner_mixin.py
+++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py
@@ -9,7 +9,6 @@ from collections.abc import Generator
 from contextlib import AbstractContextManager, contextmanager, nullcontext
 from typing import (
     TYPE_CHECKING,  # noqa: UP035
-    Optional,
 )
 
 from vllm.config import VllmConfig
@@ -65,7 +64,7 @@ class KVConnectorModelRunnerMixin:
     @staticmethod
     def get_finished_kv_transfers(
         scheduler_output: "SchedulerOutput",
-    ) -> tuple[Optional[set[str]], Optional[set[str]]]:
+    ) -> tuple[set[str] | None, set[str] | None]:
         if has_kv_transfer_group():
             return get_kv_transfer_group().get_finished(
                 scheduler_output.finished_req_ids
@@ -95,7 +94,7 @@ class KVConnectorModelRunnerMixin:
     @staticmethod
     def maybe_get_kv_connector_output(
         scheduler_output: "SchedulerOutput",
-    ) -> AbstractContextManager[Optional[KVConnectorOutput]]:
+    ) -> AbstractContextManager[KVConnectorOutput | None]:
         return (
             KVConnectorModelRunnerMixin._get_kv_connector_output(scheduler_output)
             if has_kv_transfer_group()
@@ -139,7 +138,7 @@ class KVConnectorModelRunnerMixin:
             kv_connector.clear_connector_metadata()
 
     @staticmethod
-    def get_kv_connector_stats() -> Optional[KVConnectorStats]:
+    def get_kv_connector_stats() -> KVConnectorStats | None:
         if has_kv_transfer_group():
             return get_kv_transfer_group().get_kv_connector_stats()
         return None
diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py
index 45b7a548d1843..3057d3dc00e82 100644
--- a/vllm/v1/worker/lora_model_runner_mixin.py
+++ b/vllm/v1/worker/lora_model_runner_mixin.py
@@ -5,7 +5,6 @@ Define LoRA functionality mixin for model runners.
 """
 
 from contextlib import contextmanager
-from typing import Optional, Union
 
 import numpy as np
 import torch
@@ -21,7 +20,7 @@ from vllm.model_executor.models import supports_lora, supports_multimodal
 from vllm.v1.worker.gpu_input_batch import InputBatch as GPUInputBatch
 from vllm.v1.worker.tpu_input_batch import InputBatch as TPUInputBatch
 
-InputBatch = Union[TPUInputBatch, GPUInputBatch]
+InputBatch = TPUInputBatch | GPUInputBatch
 
 logger = init_logger(__name__)
 
@@ -85,7 +84,7 @@ class LoRAModelRunnerMixin:
 
     @contextmanager
     def maybe_setup_dummy_loras(
-        self, lora_config: Optional[LoRAConfig], remove_lora: bool = True
+        self, lora_config: LoRAConfig | None, remove_lora: bool = True
     ):
         if lora_config is None:
             yield
@@ -121,7 +120,7 @@ class LoRAModelRunnerMixin:
 
     @contextmanager
     def maybe_select_dummy_loras(
-        self, lora_config: Optional[LoRAConfig], num_scheduled_tokens: np.ndarray
+        self, lora_config: LoRAConfig | None, num_scheduled_tokens: np.ndarray
     ):
         if lora_config is None:
             yield
@@ -158,7 +157,7 @@ class LoRAModelRunnerMixin:
     @contextmanager
     def maybe_dummy_run_with_lora(
         self,
-        lora_config: Optional[LoRAConfig],
+        lora_config: LoRAConfig | None,
         num_scheduled_tokens: np.ndarray,
         remove_lora: bool = True,
     ):
@@ -168,7 +167,7 @@ class LoRAModelRunnerMixin:
         ):
             yield
 
-    def maybe_remove_all_loras(self, lora_config: Optional[LoRAConfig]):
+    def maybe_remove_all_loras(self, lora_config: LoRAConfig | None):
         if lora_config is None:
             return
         self.lora_manager.remove_all_adapters()
diff --git a/vllm/v1/worker/tpu_input_batch.py b/vllm/v1/worker/tpu_input_batch.py
index ef115ade09ab8..80b62066c8df9 100644
--- a/vllm/v1/worker/tpu_input_batch.py
+++ b/vllm/v1/worker/tpu_input_batch.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Datastructures defining a TPU input batch
 
-from typing import Optional, cast
+from typing import cast
 
 import numpy as np
 import torch
@@ -36,7 +36,7 @@ class InputBatch:
         self.pin_memory = pin_memory
         self.vocab_size = vocab_size
 
-        self._req_ids: list[Optional[str]] = []
+        self._req_ids: list[str | None] = []
         self.req_id_to_index: dict[str, int] = {}
 
         # TODO(woosuk): This buffer could be too large if max_model_len is big.
@@ -155,17 +155,17 @@ class InputBatch:
         # To accumulate prompt logprobs tensor chunks across prefill steps.
         self.in_progress_prompt_logprobs_cpu: dict[str, LogprobsTensors] = {}
 
-        self.logit_bias: list[Optional[dict[int, float]]] = [None] * max_num_reqs
+        self.logit_bias: list[dict[int, float] | None] = [None] * max_num_reqs
         self.has_allowed_token_ids: set[str] = set()
         # NOTE(lufang): In the mask tensor, if the corresponding token allowed,
         # the value is False. Since we use masked_fill_ to set -inf.
-        self.allowed_token_ids_mask: Optional[torch.Tensor] = None
-        self.allowed_token_ids_mask_cpu_tensor: Optional[torch.Tensor] = None
+        self.allowed_token_ids_mask: torch.Tensor | None = None
+        self.allowed_token_ids_mask_cpu_tensor: torch.Tensor | None = None
 
         # req_index -> bad_words_token_ids
         self.bad_words_token_ids: dict[int, list[list[int]]] = {}
 
-        self.req_output_token_ids: list[Optional[list[int]]] = []
+        self.req_output_token_ids: list[list[int] | None] = []
 
     @property
     def req_ids(self) -> list[str]:
@@ -176,7 +176,7 @@ class InputBatch:
     def add_request(
         self,
         request: "CachedRequestState",
-        req_index: Optional[int] = None,
+        req_index: int | None = None,
     ) -> None:
         if req_index is None:
             req_index = self.num_reqs
@@ -296,7 +296,7 @@ class InputBatch:
             # No LoRA
             self.request_lora_mapping[req_index] = 0
 
-    def remove_request(self, req_id: str) -> Optional[int]:
+    def remove_request(self, req_id: str) -> int | None:
         """This method must always be followed by a call to condense()."""
 
         req_index = self.req_id_to_index.pop(req_id, None)
@@ -580,7 +580,7 @@ class InputBatch:
         )
 
     @property
-    def max_num_logprobs(self) -> Optional[int]:
+    def max_num_logprobs(self) -> int | None:
         return max(self.num_logprobs.values()) if self.num_logprobs else None
 
     @property
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index f9e1fcedc8903..6fd71259dbcbf 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -3,7 +3,7 @@
 import bisect
 import gc
 import time
-from typing import TYPE_CHECKING, Any, Optional, cast
+from typing import TYPE_CHECKING, Any, cast
 from unittest.mock import patch
 
 import numpy as np
@@ -140,7 +140,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self,
         vllm_config: VllmConfig,
         device: torch.device,
-        original_parallel_config: Optional[ParallelConfig] = None,
+        original_parallel_config: ParallelConfig | None = None,
     ):
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
@@ -1050,7 +1050,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
     def _get_model_inputs(
         self,
         input_ids: torch.Tensor,
-        mm_embed_inputs: Optional[tuple[list[torch.Tensor], torch.Tensor]],
+        mm_embed_inputs: tuple[list[torch.Tensor], torch.Tensor] | None,
     ):
         if self.supports_mm_inputs:
             mm_embeds, is_mm_embed = mm_embed_inputs or (None, None)
@@ -1076,7 +1076,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
     def execute_model(
         self,
         scheduler_output: "SchedulerOutput",
-        intermediate_tensors: Optional[IntermediateTensors] = None,
+        intermediate_tensors: IntermediateTensors | None = None,
     ) -> ModelRunnerOutput:
         # Update cached state
         self._update_states(scheduler_output)
@@ -1220,7 +1220,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         ), "req_ids contains None"
         req_ids = cast(list[str], self.input_batch.req_ids[:num_reqs])
 
-        prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]] = {}
+        prompt_logprobs_dict: dict[str, LogprobsTensors | None] = {}
         for req_id in self.input_batch.req_ids[:num_reqs]:
             prompt_logprobs_dict[req_id] = None
 
@@ -2127,8 +2127,8 @@ def replace_set_lora(model):
         index: int,
         lora_a: torch.Tensor,
         lora_b: torch.Tensor,
-        embeddings_tensor: Optional[torch.Tensor],
-        bias: Optional[torch.Tensor] = None,
+        embeddings_tensor: torch.Tensor | None,
+        bias: torch.Tensor | None = None,
     ):
         # TODO: The integer index leads to a recompilation, but converting it
         # to a tensor doesn't seem to work anymore. This might be fixed with a
diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py
index b64cec318f6c6..9bce362120acf 100644
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@@ -3,7 +3,8 @@
 """A TPU worker class."""
 
 import os
-from typing import Any, Callable, Optional, TypeVar
+from collections.abc import Callable
+from typing import Any, TypeVar
 
 import torch
 import torch.distributed
@@ -257,7 +258,7 @@ class TPUWorker:
     def execute_model(
         self,
         scheduler_output: "SchedulerOutput",
-    ) -> Optional[ModelRunnerOutput]:
+    ) -> ModelRunnerOutput | None:
         output = self.model_runner.execute_model(scheduler_output)
         # every worker's output is needed when kv_transfer_group is set up
         return output if self.is_driver_worker or has_kv_transfer_group() else None
@@ -317,7 +318,7 @@ class TPUWorker:
         self,
         vllm_config: VllmConfig,
         rank: int,
-        distributed_init_method: Optional[str] = None,
+        distributed_init_method: str | None = None,
         local_rank: int = -1,
     ) -> None:
         """Initialize the distributed environment."""
diff --git a/vllm/v1/worker/ubatch_utils.py b/vllm/v1/worker/ubatch_utils.py
index ef22977e094b2..33a1921d2d98e 100644
--- a/vllm/v1/worker/ubatch_utils.py
+++ b/vllm/v1/worker/ubatch_utils.py
@@ -1,9 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
+from typing import TypeAlias
 
 import numpy as np
-from typing_extensions import TypeAlias
 
 from vllm.config import ParallelConfig
 
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index 6657a2a8db828..d63978b32c187 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 import torch
 
@@ -193,7 +193,7 @@ def sanity_check_mm_encoder_outputs(
 
 def scatter_mm_placeholders(
     embeds: torch.Tensor,
-    is_embed: Optional[torch.Tensor],
+    is_embed: torch.Tensor | None,
 ) -> torch.Tensor:
     """
     Scatter the multimodal embeddings into a contiguous tensor that represents
@@ -221,7 +221,7 @@ def scatter_mm_placeholders(
 
 def gather_mm_placeholders(
     placeholders: torch.Tensor,
-    is_embed: Optional[torch.Tensor],
+    is_embed: torch.Tensor | None,
 ) -> torch.Tensor:
     """
     Reconstructs the embeddings from the placeholder tokens.
@@ -238,7 +238,7 @@ def gather_mm_placeholders(
 def add_kv_sharing_layers_to_kv_cache_groups(
     shared_kv_cache_layers: dict[str, str],
     kv_cache_groups: list[KVCacheGroupSpec],
-    runner_only_attn_layers: Optional[set[str]] = None,
+    runner_only_attn_layers: set[str] | None = None,
 ) -> None:
     """
     Sets up KV cache sharing by reusing the allocated KV caches in `kv_caches`
@@ -270,7 +270,7 @@ def bind_kv_cache(
     kv_caches: dict[str, torch.Tensor],
     forward_context: dict[str, "Attention"],
     runner_kv_caches: list[torch.Tensor],
-    num_attn_module: Optional[int] = 1,
+    num_attn_module: int | None = 1,
 ) -> None:
     """
     Bind the allocated KV cache to both ModelRunner and forward context so
diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py
index 8ee3b240904ca..85436b443f7c0 100644
--- a/vllm/v1/worker/worker_base.py
+++ b/vllm/v1/worker/worker_base.py
@@ -1,10 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from __future__ import annotations
-
 import os
-from typing import TYPE_CHECKING, Any, Callable, TypeVar, Union
+from collections.abc import Callable
+from typing import TYPE_CHECKING, Any, TypeVar
 
 import torch
 import torch.nn as nn
@@ -26,6 +25,9 @@ from vllm.v1.kv_cache_interface import KVCacheSpec
 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
     from vllm.v1.outputs import ModelRunnerOutput
+else:
+    SchedulerOutput = object
+    ModelRunnerOutput = object
 
 logger = init_logger(__name__)
 
@@ -329,7 +331,7 @@ class WorkerWrapperBase:
             # To make vLLM config available during device initialization
             self.worker.init_device()  # type: ignore
 
-    def execute_method(self, method: Union[str, bytes], *args, **kwargs):
+    def execute_method(self, method: str | bytes, *args, **kwargs):
         try:
             # method resolution order:
             # if a method is defined in this class, it will be called directly.

From 18ed7746eacb6fd3d885d489343ff1bd711361eb Mon Sep 17 00:00:00 2001
From: gjgjos <gjgjos@naver.com>
Date: Mon, 13 Oct 2025 02:00:52 +0900
Subject: [PATCH 25/30] [Feature] Add support for naver/splade-v3 (BERT-based
 sparse embedding model) (#26339)

Signed-off-by: gjgjos <gjgjos@naver.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 .../pooling/test_splade_sparse_pooler.py      | 122 ++++++++++
 tests/models/registry.py                      |   3 +
 vllm/model_executor/models/bert.py            | 214 ++++++++++++++++++
 vllm/model_executor/models/registry.py        |   1 +
 4 files changed, 340 insertions(+)
 create mode 100644 tests/models/language/pooling/test_splade_sparse_pooler.py

diff --git a/tests/models/language/pooling/test_splade_sparse_pooler.py b/tests/models/language/pooling/test_splade_sparse_pooler.py
new file mode 100644
index 0000000000000..636a6f2f9d74b
--- /dev/null
+++ b/tests/models/language/pooling/test_splade_sparse_pooler.py
@@ -0,0 +1,122 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import types
+
+import numpy as np
+import pytest
+import torch
+import torch.nn as nn
+
+from vllm.model_executor.models.bert import (
+    BertMLMHead,
+    SPLADESparsePooler,
+)
+
+# ---------------------------------------------------------------------
+# 1) Functional test: SPLADE formula correctness (no HF download needed)
+# ---------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("B,T,H,V", [(2, 3, 5, 7)])
+def test_splade_pooler_matches_reference_formula(B, T, H, V):
+    """Ensure SPLADESparsePooler forward() matches the mathematical formula:
+    log1p(relu(logits)) -> max over sequence length (after masking)."""
+    torch.manual_seed(0)
+
+    # Prepare [B] sequences of shape [T, H]
+    hs_list = [torch.randn(T, H) for _ in range(B)]
+
+    # Simulate PoolingMetadata (only required fields)
+    prompt_lens = [T, T - 1]
+    token_ids = torch.tensor(
+        [
+            [101, 5, 102],  # Batch 0: [CLS], token, [SEP]
+            [101, 6, 6],  # Batch 1: [CLS], token, token (last token ignored)
+        ],
+        dtype=torch.long,
+    )
+    meta = types.SimpleNamespace(prompt_lens=prompt_lens, prompt_token_ids=token_ids)
+
+    # MLM head (prefer BertMLMHead, fallback to Linear if unavailable)
+    try:
+        mlm_head = BertMLMHead(hidden_size=H, vocab_size=V, layer_norm_eps=1e-12)
+    except Exception:
+        mlm_head = nn.Linear(H, V, bias=True)
+
+    # Forward pass through SPLADE pooler
+    pooler = SPLADESparsePooler(mlm_head=mlm_head, pooling="max", remove_cls_sep=True)
+    pooled = pooler(hidden_states=hs_list, pooling_metadata=meta)  # list of [V]
+
+    # Basic output checks
+    assert isinstance(pooled, list) and len(pooled) == B
+    for vec in pooled:
+        assert vec.shape == (V,)
+        assert torch.isfinite(vec).all()
+        assert (vec >= 0).all(), "SPLADE outputs must be non-negative."
+
+    # Reference implementation for comparison
+    def ref_one(hs: torch.Tensor, L: int, tid_row: torch.Tensor) -> torch.Tensor:
+        keep = torch.ones(L, dtype=torch.bool)
+        if L > 0 and tid_row[0].item() == 101:  # remove CLS
+            keep[0] = False
+        if L > 0 and tid_row[L - 1].item() == 102:  # remove SEP
+            keep[L - 1] = False
+
+        valid = hs[:L][keep[:L]]
+        if valid.numel() == 0:
+            return torch.zeros(V, dtype=torch.float32)
+
+        logits = mlm_head(valid)  # [L', V]
+        scores = torch.log1p(torch.relu(logits))  # [L', V]
+        return scores.max(dim=0).values.to(torch.float32)
+
+    torch.testing.assert_close(
+        pooled[0],
+        ref_one(hs_list[0], prompt_lens[0], token_ids[0]),
+        rtol=1e-4,
+        atol=1e-4,
+    )
+    torch.testing.assert_close(
+        pooled[1],
+        ref_one(hs_list[1], prompt_lens[1], token_ids[1]),
+        rtol=1e-4,
+        atol=1e-4,
+    )
+
+
+# ---------------------------------------------------------------------
+# 2) Integration smoke test: end-to-end embedding path wiring
+# ---------------------------------------------------------------------
+
+
+@pytest.mark.cpu_model
+def test_bert_splade_sparse_embed_smoke(vllm_runner, monkeypatch):
+    """Ensure BertSpladeSparseEmbeddingModel loads and produces sparse embeddings."""
+    from transformers import AutoTokenizer
+
+    MODEL_ID = "hf-internal-testing/tiny-random-bert"
+    hf_overrides = {"architectures": ["BertSpladeSparseEmbeddingModel"]}
+
+    # Enforce CPU-only execution (optional)
+    monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "")
+    monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")
+
+    tok = AutoTokenizer.from_pretrained(MODEL_ID)
+    vocab_size = tok.vocab_size
+
+    # The embed path should route through SPLADESparsePooler
+    with vllm_runner(
+        MODEL_ID,
+        runner="pooling",
+        max_model_len=64,
+        hf_overrides=hf_overrides,
+    ) as vm:
+        outs = vm.embed(["hello world", "splade sparse test"])
+
+        # Basic sanity checks
+        assert len(outs) == 2
+        assert outs[0].shape[0] == vocab_size
+        assert outs[1].shape[0] == vocab_size
+        assert np.isfinite(outs[0]).all() and (outs[0] >= 0).all()
+        assert np.isfinite(outs[1]).all() and (outs[1] >= 0).all()
diff --git a/tests/models/registry.py b/tests/models/registry.py
index ad90229adf8a7..fbc11c2ddfd4c 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -486,6 +486,9 @@ _EMBEDDING_EXAMPLE_MODELS = {
     "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2"),
     "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1"),
     "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small"),
+    "BertSpladeSparseEmbeddingModel": _HfExamplesInfo(
+        "naver/splade-v3", is_available_online=False
+    ),
     # [Multimodal]
     "CLIPModel": _HfExamplesInfo("openai/clip-vit-base-patch32"),
     "LlavaNextForConditionalGeneration": _HfExamplesInfo("royokong/e5-v"),
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index e07da3d4d29ae..df302aee0bf6b 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -572,6 +572,220 @@ def _decode_token_type_ids(input_ids: torch.Tensor) -> torch.Tensor:
     return token_type_ids
 
 
+class BertMLMHead(nn.Module):
+    def __init__(
+        self, hidden_size: int, vocab_size: int, layer_norm_eps: float = 1e-12
+    ):
+        super().__init__()
+        self.dense = nn.Linear(hidden_size, hidden_size)
+        self.activation = nn.GELU()
+        self.layer_norm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
+        self.decoder = nn.Linear(hidden_size, vocab_size, bias=True)
+
+    def tie_weights_with_embeddings(self, embeddings_weight: torch.Tensor):
+        self.decoder.weight = embeddings_weight
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        x = self.dense(hidden_states)
+        x = self.activation(x)
+        x = self.layer_norm(x)
+        logits = self.decoder(x)
+        return logits
+
+
+class SPLADESparsePooler(Pooler):
+    """
+    SPLADE sparse pooling:
+    logits = mlm_head(hidden_states)
+            -> log1p(relu(logits))
+            -> (max|sum over L)
+            -> [V]
+
+    Padding is masked with an attention mask,
+    [CLS]/[SEP] is removed (selected),
+    and then pooled.
+    """
+
+    def __init__(
+        self,
+        mlm_head: nn.Module,
+        cls_token_id: Optional[int] = 101,
+        sep_token_id: Optional[int] = 102,
+        pooling: str = "max",
+        remove_cls_sep: bool = True,
+    ):
+        super().__init__()
+        assert pooling in ("max", "sum")
+        self.mlm_head = mlm_head
+        self.cls_token_id = cls_token_id
+        self.sep_token_id = sep_token_id
+        self.pooling = pooling
+        self.remove_cls_sep = remove_cls_sep
+
+    def get_supported_tasks(self) -> Set[PoolingTask]:
+        return {"embed"}
+
+    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
+        return PoolingParamsUpdate(requires_token_ids=True)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> torch.Tensor:
+        assert isinstance(hidden_states, torch.Tensor) and hidden_states.dim() == 2
+
+        lens_tensor: torch.Tensor = pooling_metadata.prompt_lens
+        lens: list[int] = lens_tensor.tolist()
+        B: int = len(lens)
+
+        token_ids = pooling_metadata.prompt_token_ids
+        offset = 0
+        pooled_list: list[torch.Tensor] = []
+
+        for i in range(B):
+            L = int(lens[i])
+            hs = hidden_states[offset : offset + L]
+
+            start_idx = 0
+            end_idx = L
+            if self.remove_cls_sep and token_ids is not None:
+                if (
+                    self.cls_token_id is not None
+                    and token_ids[i, 0].item() == self.cls_token_id
+                ):
+                    start_idx = 1
+                if (
+                    self.sep_token_id is not None
+                    and token_ids[i, L - 1].item() == self.sep_token_id
+                ):
+                    end_idx = max(start_idx, L - 1)
+
+            if end_idx <= start_idx:
+                V = int(self.mlm_head.decoder.out_features)
+                pooled_list.append(hs.new_zeros((V,)))
+                offset += L
+                continue
+
+            logits_i = self.mlm_head(hs[start_idx:end_idx])
+            scores_i = torch.log1p(torch.relu(logits_i))
+
+            if self.pooling == "sum":
+                pooled_i = scores_i.sum(dim=0)
+            else:  # "max"
+                pooled_i = scores_i.max(dim=0).values
+
+            pooled_list.append(pooled_i.contiguous())
+            offset += L
+
+        return torch.stack(pooled_list, dim=0).contiguous()
+
+
+@default_pooling_type("CLS")
+class BertSpladeSparseEmbeddingModel(BertEmbeddingModel):
+    """
+    BertEmbeddingModel + SPLADE sparse embedding.
+    - Make logits by self.mlm_head
+    - pooler: SPLADESparsePooler(mlm_head...)
+    """
+
+    def __init__(
+        self, *, vllm_config: VllmConfig, prefix: str = "", splade_pooling: str = "max"
+    ):
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+        cfg = vllm_config.model_config.hf_config
+
+        # MLM head
+        self.mlm_head = BertMLMHead(
+            hidden_size=cfg.hidden_size,
+            vocab_size=cfg.vocab_size,
+            layer_norm_eps=getattr(cfg, "layer_norm_eps", 1e-12),
+        )
+
+        self._splade_pooling = splade_pooling
+        pooler_config = vllm_config.model_config.pooler_config
+        assert pooler_config is not None
+        self.pooler = self._build_pooler(pooler_config)
+
+    def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler:
+        cfg = self.model.config
+
+        if not hasattr(self, "mlm_head"):
+            self.mlm_head = BertMLMHead(
+                hidden_size=cfg.hidden_size,
+                vocab_size=cfg.vocab_size,
+                layer_norm_eps=getattr(cfg, "layer_norm_eps", 1e-12),
+            )
+
+        pooling_mode = getattr(self, "_splade_pooling", "max")
+
+        cls_id = getattr(cfg, "cls_token_id", None)
+        sep_id = getattr(cfg, "sep_token_id", None)
+
+        return DispatchPooler(
+            {
+                "encode": Pooler.for_encode(pooler_config),
+                "embed": SPLADESparsePooler(
+                    mlm_head=self.mlm_head,
+                    cls_token_id=cls_id,
+                    sep_token_id=sep_id,
+                    pooling=pooling_mode,  # "max" or "sum"
+                    remove_cls_sep=True,
+                ),
+            }
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        if not hasattr(self, "mlm_head"):
+            cfg = self.model.config
+            self.mlm_head = BertMLMHead(
+                hidden_size=cfg.hidden_size,
+                vocab_size=cfg.vocab_size,
+                layer_norm_eps=getattr(cfg, "layer_norm_eps", 1e-12),
+            )
+
+        def _strip(name: str) -> str:
+            for p in ("model.", "bert."):
+                if name.startswith(p):
+                    name = name[len(p) :]
+            return name
+
+        weights_list = list(weights)
+        model_side: list[tuple[str, torch.Tensor]] = []
+        mlm_side: list[tuple[str, torch.Tensor]] = []
+
+        for k, w in weights_list:
+            name = _strip(k)
+            if name.startswith("cls.predictions."):
+                mlm_side.append((name, w))
+            else:
+                model_side.append((name, w))
+
+        loaded: set[str] = set()
+        loaded_model = self.model.load_weights(model_side)
+        loaded.update({"model." + n for n in loaded_model})
+
+        if mlm_side:
+            name_map = {
+                "cls.predictions.transform.dense.weight": "mlm_head.dense.weight",
+                "cls.predictions.transform.dense.bias": "mlm_head.dense.bias",
+                ("cls.predictions.transform.LayerNorm.weight"): (
+                    "mlm_head.layer_norm.weight"
+                ),
+                ("cls.predictions.transform.LayerNorm.bias"): (
+                    "mlm_head.layer_norm.bias"
+                ),
+                "cls.predictions.decoder.weight": "mlm_head.decoder.weight",
+                "cls.predictions.decoder.bias": "mlm_head.decoder.bias",
+            }
+            remapped = [(name_map[n], w) for n, w in mlm_side if n in name_map]
+            if remapped:
+                loaded_mlm = AutoWeightsLoader(self).load_weights(remapped)
+                loaded.update(loaded_mlm)
+
+        return loaded
+
+
 @default_pooling_type("CLS")
 class BertForSequenceClassification(nn.Module, SupportsCrossEncoding, SupportsQuant):
     """A model that uses Bert to provide embedding functionalities.
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 194d2593a7fe5..92ad19a20e024 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -172,6 +172,7 @@ _TEXT_GENERATION_MODELS = {
 _EMBEDDING_MODELS = {
     # [Text-only]
     "BertModel": ("bert", "BertEmbeddingModel"),
+    "BertSpladeSparseEmbeddingModel": ("bert", "BertSpladeSparseEmbeddingModel"),
     "DeciLMForCausalLM": ("nemotron_nas", "DeciLMForCausalLM"),
     "Gemma2Model": ("gemma2", "Gemma2ForCausalLM"),
     "Gemma3TextModel": ("gemma3", "Gemma3Model"),

From a6049be73cb965bad04f6657de6c4d98261a5237 Mon Sep 17 00:00:00 2001
From: Lukas Geiger <lukas.geiger94@gmail.com>
Date: Sun, 12 Oct 2025 18:20:07 +0100
Subject: [PATCH 26/30] [Models][Qwen3VL] Speedup `fast_pos_embed_interpolate`
 (#26647)

Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
---
 vllm/model_executor/models/qwen3_vl.py | 29 ++++++++++----------------
 1 file changed, 11 insertions(+), 18 deletions(-)

diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 0f706ab55a07a..39714faf9833e 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -467,8 +467,6 @@ class Qwen3_VisionTransformer(nn.Module):
             dh_grid, dw_grid = torch.meshgrid(dh, dw, indexing="ij")
             h_floor_grid, w_floor_grid = torch.meshgrid(h_floor, w_floor, indexing="ij")
             h_ceil_grid, w_ceil_grid = torch.meshgrid(h_ceil, w_ceil, indexing="ij")
-            h_floor_grid_idx = h_floor_grid * num_grid_per_side
-            h_ceil_grid_idx = h_ceil_grid * num_grid_per_side
 
             # original computation of weights
             # w00 = (1 - dh_grid) * (1 - dw_grid)
@@ -480,30 +478,25 @@ class Qwen3_VisionTransformer(nn.Module):
             w11 = dh_grid * dw_grid
             w10 = dh_grid - w11
             w01 = dw_grid - w11
-            w00 = 1 - dh_grid - dw_grid + w11
+            w00 = 1 - dh_grid - w01
 
-            idx00 = h_floor_grid_idx + w_floor_grid
-            idx01 = h_floor_grid_idx + w_ceil_grid
-            idx10 = h_ceil_grid_idx + w_floor_grid
-            idx11 = h_ceil_grid_idx + w_ceil_grid
+            h_grid = torch.stack([h_floor_grid, h_floor_grid, h_ceil_grid, h_ceil_grid])
+            w_grid = torch.stack([w_floor_grid, w_ceil_grid, w_floor_grid, w_ceil_grid])
+            h_grid_idx = h_grid * num_grid_per_side
 
-            indices = torch.stack([idx00, idx01, idx10, idx11], dim=0).reshape(4, -1)
+            indices = (h_grid_idx + w_grid).reshape(4, -1)
             weights = torch.stack([w00, w01, w10, w11], dim=0).reshape(4, -1, 1)
-            weights = weights.to(
-                dtype=self.dtype, device=self.device, non_blocking=True
-            )
+            weights = weights.to(dtype=self.dtype)
 
             embeds = self.pos_embed(indices)
             weighted_embeds = embeds * weights
-            p0, p1, p2, p3 = weighted_embeds.unbind(dim=0)
-            combined = p0 + p1 + p2 + p3
+            combined = weighted_embeds.sum(dim=0)
 
-            combined = combined.view(h * w, hidden_dim)
-            repeated = combined.unsqueeze(0).expand(t, -1, -1).contiguous()
-            repeated = repeated.view(
-                t, h // m_size, m_size, w // m_size, m_size, hidden_dim
+            combined = combined.reshape(
+                h // m_size, m_size, w // m_size, m_size, hidden_dim
             )
-            repeated = repeated.permute(0, 1, 3, 2, 4, 5).reshape(-1, hidden_dim)
+            combined = combined.permute(0, 2, 1, 3, 4).reshape(1, -1, hidden_dim)
+            repeated = combined.expand(t, -1, -1).reshape(-1, hidden_dim)
             outputs.append(repeated)
 
         return torch.cat(outputs, dim=0)

From 4fca1a1bd25a1b0d3b49f3fa832425cef5a612fb Mon Sep 17 00:00:00 2001
From: Huamin Li <3ericli@gmail.com>
Date: Sun, 12 Oct 2025 14:25:34 -0700
Subject: [PATCH 27/30] [easy] fix pre commit error on trunk (#26665)

Signed-off-by: Huamin Li <3ericli@gmail.com>
---
 vllm/model_executor/models/bert.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index df302aee0bf6b..6e81eb8dc91b3 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -609,8 +609,8 @@ class SPLADESparsePooler(Pooler):
     def __init__(
         self,
         mlm_head: nn.Module,
-        cls_token_id: Optional[int] = 101,
-        sep_token_id: Optional[int] = 102,
+        cls_token_id: int | None = 101,
+        sep_token_id: int | None = 102,
         pooling: str = "max",
         remove_cls_sep: bool = True,
     ):

From 7ef6052804819b270e5a69e54dc52ae9a016cf2d Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Sun, 12 Oct 2025 18:25:40 -0400
Subject: [PATCH 28/30] [CI/Build] Add tool to build vllm-tpu wheel (#19165)

Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
---
 setup.py                |  5 +++
 tools/vllm-tpu/build.sh | 67 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+)
 create mode 100644 tools/vllm-tpu/build.sh

diff --git a/setup.py b/setup.py
index 60dde120d5004..990fe4cde3ca7 100644
--- a/setup.py
+++ b/setup.py
@@ -540,6 +540,11 @@ def get_gaudi_sw_version():
 
 
 def get_vllm_version() -> str:
+    # Allow overriding the version. This is useful to build platform-specific
+    # wheels (e.g. CPU, TPU) without modifying the source.
+    if env_version := os.getenv("VLLM_VERSION_OVERRIDE"):
+        return env_version
+
     version = get_version(write_to="vllm/_version.py")
     sep = "+" if "+" not in version else "."  # dev versions might contain +
 
diff --git a/tools/vllm-tpu/build.sh b/tools/vllm-tpu/build.sh
new file mode 100644
index 0000000000000..fbc91e379df33
--- /dev/null
+++ b/tools/vllm-tpu/build.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+set -e # Exit immediately if a command exits with a non-zero status.
+# Script to build VLLM wheel for TPU with an optional version override.
+# Usage: build.sh [VERSION]  (VERSION is exported as VLLM_VERSION_OVERRIDE)
+
+TOOLS_DIR=$(cd "$(dirname "$0")" && pwd) # Absolute path to the script's directory
+REPO_ROOT=$(cd "$TOOLS_DIR/../../" && pwd) # Absolute path to the repo root
+VLLM_DIR="$REPO_ROOT" # Path to the vllm sources (no trailing slash, so the check below can match)
+PYPROJECT_FILE="$VLLM_DIR/pyproject.toml"
+PATCHED=false # Becomes true once pyproject.toml has been backed up for patching
+
+# Ensure we are not running from within the vllm directory if $0 is relative like "."
+if [ "$TOOLS_DIR" = "$VLLM_DIR" ]; then
+    echo "Error: This script should not be run from the vllm directory directly if using relative paths."
+    echo "Place it in a subdirectory like 'tools/vllm-tpu' and run it from the repository root or via its full path."
+    exit 1
+fi
+
+# Restore pyproject.toml if this run patched it. Idempotent: PATCHED is reset
+# so a second invocation is a no-op.
+cleanup() {
+    echo "Cleaning up..."
+    if [ "$PATCHED" = true ]; then
+        echo "Restoring original pyproject.toml..."
+        cp "${PYPROJECT_FILE}.bak" "$PYPROJECT_FILE"
+        rm -f "${PYPROJECT_FILE}.bak"
+        PATCHED=false
+    fi
+}
+# Register the traps BEFORE modifying pyproject.toml so that a failure or signal
+# at any later point restores it. Signals exit, which in turn fires the EXIT trap.
+trap cleanup EXIT
+trap 'exit 1' HUP INT QUIT PIPE TERM
+
+# Optional version argument
+if [ -n "$1" ]; then
+    USER_VERSION="$1"
+    export VLLM_VERSION_OVERRIDE="$USER_VERSION"
+    echo "User defined version: $USER_VERSION"
+else
+    echo "No version override supplied. Using default version from source."
+fi
+
+# Backup and update the project name.
+if ! grep -q "name = \"vllm-tpu\"" "$PYPROJECT_FILE"; then
+    echo "Patching pyproject.toml project name to vllm-tpu..."
+    cp "$PYPROJECT_FILE" "${PYPROJECT_FILE}.bak"
+    PATCHED=true # Set before sed so a failed edit is still rolled back
+    sed -i '0,/^name = "vllm"/s//name = "vllm-tpu"/' "$PYPROJECT_FILE" # GNU sed syntax
+fi
+
+# Navigate to the vllm directory
+cd "$VLLM_DIR"
+
+echo "Updating pyproject.toml completed. Proceeding with build..."
+
+echo "Building wheel for TPU..."
+rm -rf dist/
+mkdir -p dist/
+
+# User confirmed to use 'python -m build' directly
+if ! VLLM_TARGET_DEVICE=tpu python -m build; then
+    echo "Error: Python build command failed. Check if 'python -m build' works and the 'build' module is installed."
+    exit 1
+fi
+
+exit 0 # The EXIT trap runs cleanup and restores pyproject.toml
\ No newline at end of file

From 60e419c1eeef90425acbd6d34bfadf4202707507 Mon Sep 17 00:00:00 2001
From: bnellnm <49004751+bnellnm@users.noreply.github.com>
Date: Sun, 12 Oct 2025 20:17:50 -0400
Subject: [PATCH 29/30] [Misc] cache result of disable_inplace (#26666)

Signed-off-by: Bill Nell <bnell@redhat.com>
---
 vllm/model_executor/layers/fused_moe/utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py
index a682f848b0c4f..e5957474630ca 100644
--- a/vllm/model_executor/layers/fused_moe/utils.py
+++ b/vllm/model_executor/layers/fused_moe/utils.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import functools
 from math import prod
 
 import torch
@@ -325,5 +326,6 @@ def activation_without_mul(activation: str) -> str:
 # Torch custom ops can't deal with outputs aliasing inputs so we need to
 # disable inplace for torch >= 2.9.
 # See https://github.com/vllm-project/vllm/issues/26378
+@functools.cache
 def disable_inplace() -> bool:
     return is_torch_equal_or_newer("2.9")

From 41f3884438c082c4cc2250eb800b8586e1a103d8 Mon Sep 17 00:00:00 2001
From: quanliu <33453350+quanliu1991@users.noreply.github.com>
Date: Mon, 13 Oct 2025 09:25:42 +0800
Subject: [PATCH 30/30] [Bugfix][Core]Fix block table out-of-range issue in
 priority scheduling (#26661)

Signed-off-by: quanliu <18646313696@163.com>
---
 vllm/v1/core/sched/scheduler.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 32c2eb8a46526..16808417766ae 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -271,6 +271,9 @@ class Scheduler(SchedulerInterface):
                     self.running.remove(preempted_req)
                     if preempted_req in scheduled_running_reqs:
                         scheduled_running_reqs.remove(preempted_req)
+                        token_budget += num_scheduled_tokens[preempted_req.request_id]
+                        req_to_new_blocks.pop(preempted_req.request_id)
+                        num_scheduled_tokens.pop(preempted_req.request_id)
                 else:
                     preempted_req = self.running.pop()