From be067861c6ab4df53a45e332caf53c163330290d Mon Sep 17 00:00:00 2001 From: Chauncey Date: Sat, 11 Oct 2025 10:43:39 +0800 Subject: [PATCH 01/30] [Frontend] Improve the performance of `is_reasoning_end` (#25735) Signed-off-by: chaunceyjiang --- vllm/reasoning/basic_parsers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/reasoning/basic_parsers.py b/vllm/reasoning/basic_parsers.py index b4106a4f57945..f47ffe6212caf 100644 --- a/vllm/reasoning/basic_parsers.py +++ b/vllm/reasoning/basic_parsers.py @@ -59,7 +59,8 @@ class BaseThinkingReasoningParser(ReasoningParser): ) def is_reasoning_end(self, input_ids: list[int]) -> bool: - return self.end_token_id in input_ids + end_token_id = self.end_token_id + return any(input_id == end_token_id for input_id in reversed(input_ids)) def extract_content_ids(self, input_ids: list[int]) -> list[int]: """ From 8f8474fbe357c587fee838cf76e59c465593a2fd Mon Sep 17 00:00:00 2001 From: Nishidha Panpaliya Date: Sat, 11 Oct 2025 10:34:42 +0530 Subject: [PATCH 02/30] [CI/Build] Fix ppc64le CPU build and tests (#22443) Signed-off-by: Nishidha Panpaliya --- .../hardware_ci/run-cpu-test-ppc64le.sh | 15 +++-- cmake/cpu_extension.cmake | 2 +- docker/Dockerfile.ppc64le | 64 +++++++++++++------ 3 files changed, 55 insertions(+), 26 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh index 36bcb015d308e..39ea180173081 100755 --- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh @@ -25,25 +25,28 @@ function cpu_tests() { # offline inference podman exec -it "$container_id" bash -c " - set -e - python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" + set -xve + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log # Run basic model test podman exec -it "$container_id" bash -c " - set -e + set -evx 
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib pip install sentence-transformers datamodel_code_generator - pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model + + # Note: disable Bart until supports V1 + # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2] pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m] pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it] pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach] - pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" + # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being. + # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log } # All of CPU tests are expected to be finished less than 40 mins. 
export container_id export -f cpu_tests -timeout 40m bash -c cpu_tests +timeout 120m bash -c cpu_tests diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index c962564c8da08..a6e53588f4f0f 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -309,4 +309,4 @@ define_gpu_extension_target( WITH_SOABI ) -message(STATUS "Enabling C extension.") +message(STATUS "Enabling C extension.") \ No newline at end of file diff --git a/docker/Dockerfile.ppc64le b/docker/Dockerfile.ppc64le index 5eaef4ea980de..ad9eae94b83dd 100644 --- a/docker/Dockerfile.ppc64le +++ b/docker/Dockerfile.ppc64le @@ -1,4 +1,4 @@ -ARG BASE_UBI_IMAGE_TAG=9.5-1741850109 +ARG BASE_UBI_IMAGE_TAG=9.6-1754584681 ############################################################### # Stage to build openblas @@ -7,7 +7,7 @@ ARG BASE_UBI_IMAGE_TAG=9.5-1741850109 FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS openblas-builder ARG MAX_JOBS -ARG OPENBLAS_VERSION=0.3.29 +ARG OPENBLAS_VERSION=0.3.30 RUN microdnf install -y dnf && dnf install -y gcc-toolset-13 make wget unzip \ && source /opt/rh/gcc-toolset-13/enable \ && wget https://github.com/OpenMathLib/OpenBLAS/releases/download/v$OPENBLAS_VERSION/OpenBLAS-$OPENBLAS_VERSION.zip \ @@ -38,7 +38,7 @@ RUN dnf install -y openjpeg2-devel lcms2-devel tcl-devel tk-devel fribidi-devel FROM centos-deps-builder AS base-builder ARG PYTHON_VERSION=3.12 -ARG OPENBLAS_VERSION=0.3.29 +ARG OPENBLAS_VERSION=0.3.30 # Set Environment Variables for venv, cargo & openblas ENV VIRTUAL_ENV=/opt/vllm @@ -61,7 +61,7 @@ RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/, pkgconfig xsimd zeromq-devel kmod findutils protobuf* \ libtiff-devel libjpeg-devel zlib-devel freetype-devel libwebp-devel \ harfbuzz-devel libraqm-devel libimagequant-devel libxcb-devel \ - python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \ + python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip clang-devel \ && dnf clean 
all \ && PREFIX=/usr/local make -C /openblas install \ && ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \ @@ -79,9 +79,9 @@ RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/, FROM base-builder AS torch-builder ARG MAX_JOBS -ARG TORCH_VERSION=2.6.0 +ARG TORCH_VERSION=2.7.0 ARG _GLIBCXX_USE_CXX11_ABI=1 -ARG OPENBLAS_VERSION=0.3.29 +ARG OPENBLAS_VERSION=0.3.30 RUN --mount=type=cache,target=/root/.cache/uv \ source /opt/rh/gcc-toolset-13/enable && \ @@ -93,7 +93,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ MAX_JOBS=${MAX_JOBS:-$(nproc)} \ PYTORCH_BUILD_VERSION=${TORCH_VERSION} PYTORCH_BUILD_NUMBER=1 uv build --wheel --out-dir /torchwheels/ -ARG TORCHVISION_VERSION=0.21.0 +ARG TORCHVISION_VERSION=0.22.0 ARG TORCHVISION_USE_NVJPEG=0 ARG TORCHVISION_USE_FFMPEG=0 RUN --mount=type=cache,target=/root/.cache/uv \ @@ -104,7 +104,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ BUILD_VERSION=${TORCHVISION_VERSION} \ uv build --wheel --out-dir /torchwheels/ --no-build-isolation -ARG TORCHAUDIO_VERSION=2.6.0 +ARG TORCHAUDIO_VERSION=2.7.0 ARG BUILD_SOX=1 ARG BUILD_KALDI=1 ARG BUILD_RNNT=1 @@ -128,7 +128,7 @@ FROM base-builder AS arrow-builder ARG MAX_JOBS ARG PYARROW_PARALLEL -ARG PYARROW_VERSION=19.0.1 +ARG PYARROW_VERSION=21.0.0 RUN --mount=type=cache,target=/root/.cache/uv \ source /opt/rh/gcc-toolset-13/enable && \ git clone --recursive https://github.com/apache/arrow.git -b apache-arrow-${PYARROW_VERSION} && \ @@ -145,7 +145,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ make install -j ${MAX_JOBS:-$(nproc)} && \ cd ../../python/ && \ uv pip install -v -r requirements-build.txt && uv pip install numpy==2.1.3 && \ - pip show numpy && ls -lrt /opt/vllm/lib/python3.12/site-packages/numpy && \ PYARROW_PARALLEL=${PYARROW_PARALLEL:-$(nproc)} \ python setup.py build_ext \ --build-type=release --bundle-arrow-cpp \ @@ -187,6 +186,23 @@ RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_V && make -j 
${MAX_JOBS:-$(nproc)} +############################################################### +# Stage to build numba +############################################################### + +FROM base-builder AS numba-builder + +ARG MAX_JOBS +ARG NUMBA_VERSION=0.61.2 + +# Clone all required dependencies +RUN dnf install ninja-build llvm15 llvm15-devel -y && source /opt/rh/gcc-toolset-13/enable && export PATH=$PATH:/usr/lib64/llvm15/bin && \ + git clone --recursive https://github.com/numba/numba.git -b ${NUMBA_VERSION} && \ + cd ./numba && \ + if ! grep '#include "dynamic_annotations.h"' numba/_dispatcher.cpp; then \ + sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \ + fi && python -m build --wheel --installer=uv --outdir /numbawheels/ + ############################################################### # Stage to build vllm - this stage builds and installs # vllm, tensorizer and vllm-tgis-adapter and builds uv cache @@ -199,6 +215,7 @@ COPY --from=torch-builder /tmp/control /dev/null COPY --from=arrow-builder /tmp/control /dev/null COPY --from=cv-builder /tmp/control /dev/null COPY --from=numa-builder /tmp/control /dev/null +COPY --from=numba-builder /tmp/control /dev/null ARG VLLM_TARGET_DEVICE=cpu ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1 @@ -206,6 +223,8 @@ ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1 # this step installs vllm and populates uv cache # with all the transitive dependencies RUN --mount=type=cache,target=/root/.cache/uv \ + dnf install llvm15 llvm15-devel -y && \ + rpm -ivh --nodeps https://mirror.stream.centos.org/9-stream/CRB/ppc64le/os/Packages/protobuf-lite-devel-3.14.0-16.el9.ppc64le.rpm && \ source /opt/rh/gcc-toolset-13/enable && \ git clone https://github.com/huggingface/xet-core.git && cd xet-core/hf_xet/ && \ uv pip install maturin && \ @@ -215,15 +234,18 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \ 
--mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \ --mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \ + --mount=type=bind,from=numba-builder,source=/numbawheels/,target=/numbawheels/,ro \ --mount=type=bind,src=.,dst=/src/,rw \ source /opt/rh/gcc-toolset-13/enable && \ - uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \ + export PATH=$PATH:/usr/lib64/llvm15/bin && \ + uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /numbawheels/*.whl && \ sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \ - uv pip install pandas pythran pybind11 /hf_wheels/*.whl && \ + sed -i -e 's/.*sentencepiece.*//g' /src/pyproject.toml /src/requirements/*.txt && \ + uv pip install sentencepiece==0.2.0 pandas pythran nanobind pybind11 /hf_wheels/*.whl && \ make -C /numactl install && \ # sentencepiece.pc is in some pkgconfig inside uv cache export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \ - uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \ + nanobind_DIR=$(uv pip show nanobind | grep Location | sed 's/^Location: //;s/$/\/nanobind\/cmake/') && uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \ cd /src/ && \ uv build --wheel --out-dir /vllmwheel/ --no-build-isolation && \ uv pip install /vllmwheel/*.whl @@ -250,7 +272,7 @@ RUN git clone --recursive https://github.com/Reference-LAPACK/lapack.git -b v${L FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS vllm-openai ARG PYTHON_VERSION=3.12 -ARG OPENBLAS_VERSION=0.3.29 +ARG OPENBLAS_VERSION=0.3.30 # Set Environment Variables for venv & openblas ENV VIRTUAL_ENV=/opt/vllm @@ -268,6 +290,7 @@ COPY --from=vllmcache-builder /tmp/control /dev/null COPY --from=numa-builder /tmp/control /dev/null COPY 
--from=lapack-builder /tmp/control /dev/null COPY --from=openblas-builder /tmp/control /dev/null +COPY --from=numba-builder /tmp/control /dev/null # install gcc-11, python, openblas, numactl, lapack RUN --mount=type=cache,target=/root/.cache/uv \ @@ -276,13 +299,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \ rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ microdnf install --nodocs -y \ - tar findutils openssl \ + libomp tar findutils openssl llvm15 llvm15-devel \ pkgconfig xsimd g++ gcc-fortran libsndfile \ libtiff libjpeg openjpeg2 zlib zeromq \ freetype lcms2 libwebp tcl tk utf8proc \ - harfbuzz fribidi libraqm libimagequant libxcb \ + harfbuzz fribidi libraqm libimagequant libxcb util-linux \ python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \ - && microdnf clean all \ + && export PATH=$PATH:/usr/lib64/llvm15/bin && microdnf clean all \ && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \ && python -m pip install -U pip uv --no-cache \ && make -C /numactl install \ @@ -298,7 +321,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \ --mount=type=bind,from=vllmcache-builder,source=/hf_wheels/,target=/hf_wheels/,ro \ --mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \ - HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl + --mount=type=bind,from=numba-builder,source=/numbawheels/,target=/numbawheels/,ro \ + export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && uv pip install sentencepiece==0.2.0 && \ + HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /numbawheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl + COPY ./ /workspace/vllm WORKDIR /workspace/vllm @@ -314,4 +340,4 @@ 
WORKDIR /workspace/ RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks -ENTRYPOINT ["vllm", "serve"] +ENTRYPOINT ["vllm", "serve"] \ No newline at end of file From 27ed39a347c56ac23f25169b7c86d7f459b1ac7f Mon Sep 17 00:00:00 2001 From: liuzhenwei Date: Sat, 11 Oct 2025 13:15:23 +0800 Subject: [PATCH 03/30] [XPU] Upgrade NIXL to remove CUDA dependency (#26570) Signed-off-by: zhenwei-intel --- .buildkite/scripts/hardware_ci/run-xpu-test.sh | 1 - docker/Dockerfile.xpu | 5 +++++ requirements/xpu.txt | 1 - tools/install_nixl_from_source_ubuntu.py | 1 + vllm/platforms/xpu.py | 15 ++++++++------- 5 files changed, 14 insertions(+), 9 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index 2fd7265fa5366..250a64fdd071c 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -44,6 +44,5 @@ docker run \ pytest -v -s v1/structured_output pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py - pytest -v -s v1/test_metrics pytest -v -s v1/test_serial_utils.py ' diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu index ffc3abd389653..49ea39cad5128 100644 --- a/docker/Dockerfile.xpu +++ b/docker/Dockerfile.xpu @@ -69,4 +69,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \ # install development dependencies (for testing) RUN python3 -m pip install -e tests/vllm_test_utils + +# install nixl from source code +RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages/.nixl.mesonpy.libs/plugins/" + ENTRYPOINT ["vllm", "serve"] diff --git 
a/requirements/xpu.txt b/requirements/xpu.txt index 5d52400e50bc6..d14b631aa9364 100644 --- a/requirements/xpu.txt +++ b/requirements/xpu.txt @@ -10,7 +10,6 @@ wheel jinja2>=3.1.6 datasets # for benchmark scripts numba == 0.61.2 # Required for N-gram speculative decoding -nixl==0.3.0 # for PD disaggregation torch==2.8.0+xpu torchaudio torchvision diff --git a/tools/install_nixl_from_source_ubuntu.py b/tools/install_nixl_from_source_ubuntu.py index c903e3f1d3f18..c808b01d2e94b 100644 --- a/tools/install_nixl_from_source_ubuntu.py +++ b/tools/install_nixl_from_source_ubuntu.py @@ -135,6 +135,7 @@ def build_and_install_prerequisites(args): "--enable-devel-headers", "--with-verbs", "--enable-mt", + "--with-ze=no", ] run_command(configure_command, cwd=ucx_source_path) run_command(["make", "-j", str(os.cpu_count() or 1)], cwd=ucx_source_path) diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index e0c8a6605b7d4..b75b52938839b 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -54,6 +54,14 @@ class XPUPlatform(Platform): has_sink: bool, use_sparse, ) -> str: + from vllm.v1.attention.backends.utils import set_kv_cache_layout + + set_kv_cache_layout("NHD") + logger.info( + "Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; " + "only NHD layout is supported by XPU attention kernels." + ) + from vllm.attention.backends.registry import _Backend if use_sparse: @@ -190,13 +198,6 @@ class XPUPlatform(Platform): vllm_config.scheduler_config.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS, ) - from vllm.v1.attention.backends.utils import set_kv_cache_layout - - set_kv_cache_layout("NHD") - logger.info( - "Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; " - "only NHD layout is supported by XPU attention kernels." 
- ) @classmethod def support_hybrid_kv_cache(cls) -> bool: From ddaff2938e0b78b2d6237f6f7975ac19167cb04e Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 10 Oct 2025 22:17:24 -0700 Subject: [PATCH 04/30] [MM] Move Qwen3Omni MRoPE impl to model file (#26608) Signed-off-by: Roger Wang Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../layers/rotary_embedding/mrope.py | 361 +----------------- .../models/qwen3_omni_moe_thinker.py | 355 +++++++++++++++-- vllm/model_executor/models/vision.py | 37 ++ vllm/v1/worker/gpu_model_runner.py | 2 +- 4 files changed, 368 insertions(+), 387 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index 0a13543c82e19..ebfe9257c6c45 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -426,7 +426,7 @@ class MRotaryEmbedding(RotaryEmbedding): ) -> tuple[torch.Tensor, int]: from vllm.transformers_utils.config import thinker_uses_mrope - if thinker_uses_mrope(hf_config): + if thinker_uses_mrope(hf_config) and hf_config.model_type == "qwen2_5_omni": return cls._omni_get_input_positions_tensor( input_tokens=input_tokens, hf_config=hf_config, @@ -1119,339 +1119,6 @@ class MRotaryEmbedding(RotaryEmbedding): return llm_positions, mrope_position_delta - @classmethod - def _omni3_get_input_positions_tensor( - cls, - config, - input_ids: torch.Tensor, - image_grid_thw: torch.Tensor, - video_grid_thw: torch.Tensor, - use_audio_in_video: bool = False, - audio_seqlens: Optional[torch.Tensor] = None, - second_per_grids: Optional[torch.Tensor] = None, - ) -> tuple[torch.Tensor, torch.Tensor]: - def _get_feat_extract_output_lengths(input_lengths: torch.LongTensor): - input_lengths_leave = input_lengths % 100 - feat_lengths = (input_lengths_leave - 1) // 2 + 1 - output_lengths = ( - ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths 
// 100) * 13 - ) - return output_lengths - - if input_ids is None or input_ids.ndim != 1: - raise ValueError("_omni3_get_input_positions_tensor expects 1D input_ids") - - seq_len = input_ids.shape[0] - device = input_ids.device - dtype = input_ids.dtype - - if image_grid_thw is not None: - image_grid_thw = image_grid_thw.to(device=device, dtype=torch.long) - if video_grid_thw is not None: - video_grid_thw = video_grid_thw.to(device=device, dtype=torch.long) - - if second_per_grids is None: - if video_grid_thw is not None and video_grid_thw.numel() > 0: - second_per_grids = torch.ones( - video_grid_thw.shape[0], dtype=torch.float32, device=device - ) - else: - second_per_grids = torch.tensor([], dtype=torch.float32, device=device) - else: - second_per_grids = second_per_grids.to(device=device, dtype=torch.float32) - - if audio_seqlens is not None: - audio_seqlens = audio_seqlens.to(device=device, dtype=torch.long) - - spatial_merge_size = config.vision_config.spatial_merge_size - image_token_id = config.image_token_id - video_token_id = config.video_token_id - audio_token_id = config.audio_token_id - vision_start_token_id = config.vision_start_token_id - audio_start_token_id = config.audio_start_token_id - position_id_per_seconds = config.position_id_per_seconds - - vision_start_indices = torch.argwhere( - input_ids == vision_start_token_id - ).squeeze(1) - if vision_start_indices.numel() > 0: - vision_tokens = input_ids[vision_start_indices + 1] - else: - vision_tokens = input_ids.new_empty((0,), dtype=input_ids.dtype) - audio_nums = torch.sum(input_ids == audio_start_token_id) - image_nums = (vision_tokens == image_token_id).sum() - video_nums = ( - (vision_tokens == audio_start_token_id).sum() - if use_audio_in_video - else (vision_tokens == video_token_id).sum() - ) - - input_tokens = input_ids.tolist() - llm_pos_ids_list: list[torch.Tensor] = [] - st = 0 - image_idx = 0 - video_idx = 0 - audio_idx = 0 - remain_images, remain_videos, remain_audios = image_nums, 
video_nums, audio_nums # noqa: E501 - multimodal_nums = ( - image_nums + audio_nums - if use_audio_in_video - else image_nums + video_nums + audio_nums - ) # noqa: E501 - - for _ in range(multimodal_nums): - st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - if (image_token_id in input_tokens or video_token_id in input_tokens) and ( - remain_videos > 0 or remain_images > 0 - ): - ed_vision_start = input_tokens.index(vision_start_token_id, st) - else: - ed_vision_start = len(input_tokens) + 1 - if audio_token_id in input_tokens and remain_audios > 0: - ed_audio_start = input_tokens.index(audio_start_token_id, st) - else: - ed_audio_start = len(input_tokens) + 1 - min_ed = min(ed_vision_start, ed_audio_start) - - if min_ed == ed_audio_start: - text_len = min_ed - st - if text_len != 0: - st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - llm_pos_ids_list.append( - torch.arange(text_len, device=device, dtype=torch.long) - .view(1, -1) - .expand(3, -1) - + st_idx - ) - st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - bos_len = 1 - llm_pos_ids_list.append( - torch.arange(bos_len, device=device, dtype=torch.long) - .view(1, -1) - .expand(3, -1) - + st_idx - ) - st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - audio_len = _get_feat_extract_output_lengths(audio_seqlens[audio_idx]) - llm_pos_ids = ( - torch.arange(audio_len, device=device, dtype=torch.long) - .view(1, -1) - .expand(3, -1) - + st_idx - ) - llm_pos_ids_list.append(llm_pos_ids) - st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - eos_len = 1 - llm_pos_ids_list.append( - torch.arange(eos_len, device=device, dtype=torch.long) - .view(1, -1) - .expand(3, -1) - + st_idx - ) - st += text_len + bos_len + audio_len + eos_len - audio_idx += 1 - remain_audios -= 1 - elif ( - min_ed == ed_vision_start - and input_ids[ed_vision_start + 1] == image_token_id - ): - text_len = min_ed - st - if text_len != 0: - st_idx = 
llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - llm_pos_ids_list.append( - torch.arange(text_len, device=device, dtype=torch.long) - .view(1, -1) - .expand(3, -1) - + st_idx - ) - st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - bos_len = 1 - llm_pos_ids_list.append( - torch.arange(bos_len, device=device, dtype=torch.long) - .view(1, -1) - .expand(3, -1) - + st_idx - ) - st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - grid_t = image_grid_thw[image_idx][0] - grid_hs = image_grid_thw[:, 1] - grid_ws = image_grid_thw[:, 2] - t_index = torch.arange(grid_t, device=device) * position_id_per_seconds - llm_pos_ids = cls._get_llm_pos_ids_for_vision( - st_idx, image_idx, spatial_merge_size, t_index, grid_hs, grid_ws - ) - image_len = image_grid_thw[image_idx].prod() // (spatial_merge_size**2) - llm_pos_ids_list.append(llm_pos_ids) - st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - eos_len = 1 - llm_pos_ids_list.append( - torch.arange(eos_len, device=device, dtype=torch.long) - .view(1, -1) - .expand(3, -1) - + st_idx - ) - st += text_len + bos_len + image_len + eos_len - image_idx += 1 - remain_images -= 1 - elif ( - min_ed == ed_vision_start - and input_ids[ed_vision_start + 1] == video_token_id - and not use_audio_in_video - ): - text_len = min_ed - st - if text_len != 0: - st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - llm_pos_ids_list.append( - torch.arange(text_len, device=device, dtype=torch.long) - .view(1, -1) - .expand(3, -1) - + st_idx - ) - st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - bos_len = 1 - llm_pos_ids_list.append( - torch.arange(bos_len, device=device, dtype=torch.long) - .view(1, -1) - .expand(3, -1) - + st_idx - ) - st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - grid_t = video_grid_thw[video_idx][0] - grid_hs = video_grid_thw[:, 1] - grid_ws = video_grid_thw[:, 2] - t_index = ( - torch.arange(grid_t, device=device) 
- * float(second_per_grids[video_idx].item()) - * position_id_per_seconds - ) - llm_pos_ids = cls._get_llm_pos_ids_for_vision( - st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws - ) - video_len = video_grid_thw[video_idx].prod() // (spatial_merge_size**2) - llm_pos_ids_list.append(llm_pos_ids) - st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - eos_len = 1 - llm_pos_ids_list.append( - torch.arange(eos_len, device=device, dtype=torch.long) - .view(1, -1) - .expand(3, -1) - + st_idx - ) - st += text_len + bos_len + video_len + eos_len - video_idx += 1 - remain_videos -= 1 - elif ( - min_ed == ed_vision_start - and ed_vision_start + 1 == ed_audio_start - and use_audio_in_video - ): - text_len = min_ed - st - if text_len != 0: - st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - llm_pos_ids_list.append( - torch.arange(text_len, device=device, dtype=torch.long) - .view(1, -1) - .expand(3, -1) - + st_idx - ) - st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - bos_len = 1 - bos_block = ( - torch.arange(bos_len, device=device, dtype=torch.long) - .view(1, -1) - .expand(3, -1) - + st_idx - ) - llm_pos_ids_list.append(bos_block) - llm_pos_ids_list.append(bos_block) - st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - audio_len = _get_feat_extract_output_lengths(audio_seqlens[audio_idx]) - audio_llm_pos_ids = ( - torch.arange(audio_len, device=device, dtype=torch.long) - .view(1, -1) - .expand(3, -1) - + st_idx - ) - grid_t = video_grid_thw[video_idx][0] - grid_hs = video_grid_thw[:, 1] - grid_ws = video_grid_thw[:, 2] - t_index = ( - torch.arange(grid_t, device=device) - * float(second_per_grids[video_idx].item()) - * position_id_per_seconds - ) - video_llm_pos_ids = cls._get_llm_pos_ids_for_vision( - st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws - ) - video_data_index, audio_data_index = 0, 0 - while ( - video_data_index < video_llm_pos_ids.shape[-1] - and 
audio_data_index < audio_llm_pos_ids.shape[-1] - ): - if ( - video_llm_pos_ids[0][video_data_index] - <= audio_llm_pos_ids[0][audio_data_index] - ): - llm_pos_ids_list.append( - video_llm_pos_ids[ - :, video_data_index : video_data_index + 1 - ] - ) - video_data_index += 1 - else: - llm_pos_ids_list.append( - audio_llm_pos_ids[ - :, audio_data_index : audio_data_index + 1 - ] - ) - audio_data_index += 1 - if video_data_index < video_llm_pos_ids.shape[-1]: - llm_pos_ids_list.append( - video_llm_pos_ids[ - :, video_data_index : video_llm_pos_ids.shape[-1] - ] - ) - if audio_data_index < audio_llm_pos_ids.shape[-1]: - llm_pos_ids_list.append( - audio_llm_pos_ids[ - :, audio_data_index : audio_llm_pos_ids.shape[-1] - ] - ) - video_len = video_grid_thw[video_idx].prod() // (spatial_merge_size**2) - st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - eos_len = 1 - eos_block = ( - torch.arange(eos_len, device=device, dtype=torch.long) - .view(1, -1) - .expand(3, -1) - + st_idx - ) - llm_pos_ids_list.append(eos_block) - llm_pos_ids_list.append(eos_block) - st += text_len + bos_len * 2 + audio_len + video_len + eos_len * 2 # noqa: E501 - audio_idx += 1 - video_idx += 1 - remain_videos -= 1 - remain_audios -= 1 - - if st < len(input_tokens): - st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - text_len = len(input_tokens) - st - llm_pos_ids_list.append( - torch.arange(text_len, device=device, dtype=torch.long) - .view(1, -1) - .expand(3, -1) - + st_idx - ) - - llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) - if llm_positions.shape[1] != seq_len: - raise RuntimeError("Position ids length mismatch with input ids length") - - position_ids = llm_positions.to(device=device, dtype=dtype) - mrope_position_delta = llm_positions.max() + 1 - seq_len - return position_ids, mrope_position_delta - @classmethod def _omni_get_input_positions_tensor( cls, @@ -1483,8 +1150,6 @@ class MRotaryEmbedding(RotaryEmbedding): # TODO(fyabc): 
refactor and share more code with # _vl_get_input_positions_tensor. - - model_type = hf_config.model_type thinker_config = hf_config.thinker_config if isinstance(image_grid_thw, list): @@ -1492,30 +1157,6 @@ class MRotaryEmbedding(RotaryEmbedding): if isinstance(video_grid_thw, list): video_grid_thw = torch.tensor(video_grid_thw) - if "qwen3_omni" in model_type: - input_tensor = torch.tensor(input_tokens) - audio_lengths_tensor = audio_feature_lengths - if audio_lengths_tensor is not None and not isinstance( - audio_lengths_tensor, torch.Tensor - ): - audio_lengths_tensor = torch.as_tensor( - audio_lengths_tensor, dtype=torch.long - ) - second_per_grids_tensor = ( - torch.tensor(second_per_grid_ts) if second_per_grid_ts else None - ) - - llm_positions, mrope_position_delta = cls._omni3_get_input_positions_tensor( # noqa: E501 - thinker_config, - input_tensor, - image_grid_thw, - video_grid_thw, - use_audio_in_video, - audio_lengths_tensor, - second_per_grids_tensor, - ) - return llm_positions, mrope_position_delta - audio_token_id = thinker_config.audio_token_index image_token_id = thinker_config.image_token_index video_token_id = thinker_config.video_token_index diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index 8a5aa9c2be3bf..6eb9faabd1c7f 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -72,7 +72,12 @@ from vllm.multimodal.processing import ( ) from vllm.sequence import IntermediateTensors -from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP +from .interfaces import ( + MultiModalEmbeddings, + SupportsMRoPE, + SupportsMultiModal, + SupportsPP, +) # yapf conflicts with isort for this block # yapf: disable @@ -96,7 +101,7 @@ from .utils import ( _merge_multimodal_embeddings, maybe_prefix, ) -from .vision import get_vit_attn_backend +from .vision import get_llm_pos_ids_for_vision, 
get_vit_attn_backend try: import flash_attn @@ -106,6 +111,15 @@ except (ImportError, ModuleNotFoundError): logger = init_logger(__name__) +def _get_feat_extract_output_lengths(input_lengths: torch.Tensor): + input_lengths_leave = input_lengths % 100 + feat_lengths = (input_lengths_leave - 1) // 2 + 1 + output_lengths = ( + ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13 + ) + return feat_lengths, output_lengths + + class Qwen3_VisionPatchEmbed(nn.Module): def __init__( self, @@ -679,16 +693,6 @@ Qwen3OmniMoeThinkerDummyInputsBuilder = Qwen2_5OmniThinkerDummyInputsBuilder class Qwen3OmniMoeThinkerMultiModalProcessor( Qwen2_5OmniThinkerMultiModalProcessor, ): - def _get_feat_extract_output_lengths( - self, input_lengths: torch.Tensor - ) -> torch.Tensor: - input_lengths_leave = input_lengths % 100 - feat_lengths = (input_lengths_leave - 1) // 2 + 1 - output_lengths = ( - ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13 - ) - return feat_lengths, output_lengths - def _call_hf_processor( self, prompt: str, @@ -882,13 +886,13 @@ class Qwen3OmniMoeThinkerMultiModalProcessor( if audio_feature_lengths is None and feature_attention_mask is None: audio_output_lengths = [] elif audio_feature_lengths is not None: - _, audio_output_lens = self._get_feat_extract_output_lengths( + _, audio_output_lens = _get_feat_extract_output_lengths( audio_feature_lengths ) audio_output_lengths = audio_output_lens.tolist() elif feature_attention_mask is not None: assert isinstance(feature_attention_mask, torch.Tensor) - _, audio_output_lens = self._get_feat_extract_output_lengths( + _, audio_output_lens = _get_feat_extract_output_lengths( feature_attention_mask.sum(-1) ) audio_output_lengths = audio_output_lens.tolist() @@ -1044,16 +1048,6 @@ class Qwen3OmniMoeConditionalGenerationMixin(Qwen2_5OmniConditionalGenerationMix else: return torch.concat(mm_input, dim=dim) - def _get_feat_extract_output_lengths( - self, input_lengths: torch.Tensor - ) 
-> torch.Tensor: - input_lengths_leave = input_lengths % 100 - feat_lengths = (input_lengths_leave - 1) // 2 + 1 - output_lengths = ( - ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13 - ) - return output_lengths, output_lengths - def _process_audio_input( self, audio_input: Qwen2AudioFeatureInputs, @@ -1072,8 +1066,8 @@ class Qwen3OmniMoeConditionalGenerationMixin(Qwen2_5OmniConditionalGenerationMix if audio_feature_lengths.ndim == 2: audio_feature_lengths = audio_feature_lengths.reshape(-1) - audio_feat_lengths, audio_output_lengths = ( - self._get_feat_extract_output_lengths(audio_feature_lengths) + audio_feat_lengths, audio_output_lengths = _get_feat_extract_output_lengths( + audio_feature_lengths ) audio_outputs = self.audio_tower( @@ -1094,6 +1088,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration( nn.Module, SupportsMultiModal, SupportsPP, + SupportsMRoPE, Qwen3OmniMoeConditionalGenerationMixin, ): hf_to_vllm_mapper = WeightsMapper( @@ -1407,3 +1402,311 @@ class Qwen3OmniMoeThinkerForConditionalGeneration( loaded_weights = loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) return loaded_weights + + @classmethod + def get_mrope_input_positions( + self, + input_tokens: list[int], + hf_config: PretrainedConfig, + image_grid_thw: Optional[Union[list[list[int]], torch.Tensor]], + video_grid_thw: Optional[Union[list[list[int]], torch.Tensor]], + second_per_grid_ts: Optional[list[float]] = None, + context_len: int = 0, + seq_len: Optional[int] = None, + audio_feature_lengths: Optional[torch.Tensor] = None, + use_audio_in_video: bool = False, + ) -> tuple[torch.Tensor, int]: + config = hf_config.thinker_config + if isinstance(image_grid_thw, list): + image_grid_thw = torch.tensor(image_grid_thw) + if isinstance(video_grid_thw, list): + video_grid_thw = torch.tensor(video_grid_thw) + input_ids = torch.tensor(input_tokens) + if input_ids is None or input_ids.ndim != 1: + raise ValueError("_omni3_get_input_positions_tensor expects 
1D input_ids") + + seq_len = input_ids.shape[0] + if audio_feature_lengths is not None and not isinstance( + audio_feature_lengths, torch.Tensor + ): + audio_feature_lengths = torch.as_tensor( + audio_feature_lengths, dtype=torch.long + ) + if second_per_grid_ts is None: + if video_grid_thw is not None and video_grid_thw.numel() > 0: + second_per_grids = torch.ones( + video_grid_thw.shape[0], dtype=torch.float32 + ) + else: + second_per_grids = torch.tensor([], dtype=torch.float32) + else: + second_per_grids = torch.tensor(second_per_grid_ts, dtype=torch.float32) + + spatial_merge_size = config.vision_config.spatial_merge_size + image_token_id = config.image_token_id + video_token_id = config.video_token_id + audio_token_id = config.audio_token_id + vision_start_token_id = config.vision_start_token_id + audio_start_token_id = config.audio_start_token_id + position_id_per_seconds = config.position_id_per_seconds + + vision_start_indices = torch.argwhere( + input_ids == vision_start_token_id + ).squeeze(1) + if vision_start_indices.numel() > 0: + vision_tokens = input_ids[vision_start_indices + 1] + else: + vision_tokens = input_ids.new_empty((0,), dtype=input_ids.dtype) + audio_nums = torch.sum(input_ids == audio_start_token_id) + image_nums = (vision_tokens == image_token_id).sum() + video_nums = ( + (vision_tokens == audio_start_token_id).sum() + if use_audio_in_video + else (vision_tokens == video_token_id).sum() + ) + + llm_pos_ids_list: list[torch.Tensor] = [] + st = 0 + image_idx = 0 + video_idx = 0 + audio_idx = 0 + remain_images, remain_videos, remain_audios = image_nums, video_nums, audio_nums # noqa: E501 + multimodal_nums = ( + image_nums + audio_nums + if use_audio_in_video + else image_nums + video_nums + audio_nums + ) # noqa: E501 + + for _ in range(multimodal_nums): + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + if (image_token_id in input_tokens or video_token_id in input_tokens) and ( + remain_videos > 0 or remain_images > 0 
+ ): + ed_vision_start = input_tokens.index(vision_start_token_id, st) + else: + ed_vision_start = len(input_tokens) + 1 + if audio_token_id in input_tokens and remain_audios > 0: + ed_audio_start = input_tokens.index(audio_start_token_id, st) + else: + ed_audio_start = len(input_tokens) + 1 + min_ed = min(ed_vision_start, ed_audio_start) + + if min_ed == ed_audio_start: + text_len = min_ed - st + if text_len != 0: + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + llm_pos_ids_list.append( + torch.arange(text_len, dtype=torch.long) + .view(1, -1) + .expand(3, -1) + + st_idx + ) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + bos_len = 1 + llm_pos_ids_list.append( + torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + _, audio_len = _get_feat_extract_output_lengths( + audio_feature_lengths[audio_idx] + ) + llm_pos_ids = ( + torch.arange(audio_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) + llm_pos_ids_list.append(llm_pos_ids) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + eos_len = 1 + llm_pos_ids_list.append( + torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) + st += text_len + bos_len + audio_len + eos_len + audio_idx += 1 + remain_audios -= 1 + elif ( + min_ed == ed_vision_start + and input_ids[ed_vision_start + 1] == image_token_id + ): + text_len = min_ed - st + if text_len != 0: + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + llm_pos_ids_list.append( + torch.arange(text_len, dtype=torch.long) + .view(1, -1) + .expand(3, -1) + + st_idx + ) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + bos_len = 1 + llm_pos_ids_list.append( + torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + grid_t = 
image_grid_thw[image_idx][0] + grid_hs = image_grid_thw[:, 1] + grid_ws = image_grid_thw[:, 2] + t_index = torch.arange(grid_t) * position_id_per_seconds + llm_pos_ids = get_llm_pos_ids_for_vision( + st_idx, image_idx, spatial_merge_size, t_index, grid_hs, grid_ws + ) + image_len = image_grid_thw[image_idx].prod() // (spatial_merge_size**2) + llm_pos_ids_list.append(llm_pos_ids) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + eos_len = 1 + llm_pos_ids_list.append( + torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) + st += text_len + bos_len + image_len + eos_len + image_idx += 1 + remain_images -= 1 + elif ( + min_ed == ed_vision_start + and input_ids[ed_vision_start + 1] == video_token_id + and not use_audio_in_video + ): + text_len = min_ed - st + if text_len != 0: + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + llm_pos_ids_list.append( + torch.arange(text_len, dtype=torch.long) + .view(1, -1) + .expand(3, -1) + + st_idx + ) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + bos_len = 1 + llm_pos_ids_list.append( + torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + grid_t = video_grid_thw[video_idx][0] + grid_hs = video_grid_thw[:, 1] + grid_ws = video_grid_thw[:, 2] + t_index = ( + torch.arange(grid_t) + * float(second_per_grids[video_idx].item()) + * position_id_per_seconds + ) + llm_pos_ids = get_llm_pos_ids_for_vision( + st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws + ) + video_len = video_grid_thw[video_idx].prod() // (spatial_merge_size**2) + llm_pos_ids_list.append(llm_pos_ids) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + eos_len = 1 + llm_pos_ids_list.append( + torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) + st += text_len + bos_len + video_len + eos_len + video_idx += 1 + 
remain_videos -= 1 + elif ( + min_ed == ed_vision_start + and ed_vision_start + 1 == ed_audio_start + and use_audio_in_video + ): + text_len = min_ed - st + if text_len != 0: + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + llm_pos_ids_list.append( + torch.arange(text_len, dtype=torch.long) + .view(1, -1) + .expand(3, -1) + + st_idx + ) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + bos_len = 1 + bos_block = ( + torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) + llm_pos_ids_list.append(bos_block) + llm_pos_ids_list.append(bos_block) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + _, audio_len = _get_feat_extract_output_lengths( + audio_feature_lengths[audio_idx] + ) + audio_llm_pos_ids = ( + torch.arange(audio_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) + grid_t = video_grid_thw[video_idx][0] + grid_hs = video_grid_thw[:, 1] + grid_ws = video_grid_thw[:, 2] + t_index = ( + torch.arange(grid_t) + * float(second_per_grids[video_idx].item()) + * position_id_per_seconds + ) + video_llm_pos_ids = get_llm_pos_ids_for_vision( + st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws + ) + video_data_index, audio_data_index = 0, 0 + while ( + video_data_index < video_llm_pos_ids.shape[-1] + and audio_data_index < audio_llm_pos_ids.shape[-1] + ): + if ( + video_llm_pos_ids[0][video_data_index] + <= audio_llm_pos_ids[0][audio_data_index] + ): + llm_pos_ids_list.append( + video_llm_pos_ids[ + :, video_data_index : video_data_index + 1 + ] + ) + video_data_index += 1 + else: + llm_pos_ids_list.append( + audio_llm_pos_ids[ + :, audio_data_index : audio_data_index + 1 + ] + ) + audio_data_index += 1 + if video_data_index < video_llm_pos_ids.shape[-1]: + llm_pos_ids_list.append( + video_llm_pos_ids[ + :, video_data_index : video_llm_pos_ids.shape[-1] + ] + ) + if audio_data_index < audio_llm_pos_ids.shape[-1]: + llm_pos_ids_list.append( + 
audio_llm_pos_ids[ + :, audio_data_index : audio_llm_pos_ids.shape[-1] + ] + ) + video_len = video_grid_thw[video_idx].prod() // (spatial_merge_size**2) + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + eos_len = 1 + eos_block = ( + torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) + llm_pos_ids_list.append(eos_block) + llm_pos_ids_list.append(eos_block) + st += text_len + bos_len * 2 + audio_len + video_len + eos_len * 2 # noqa: E501 + audio_idx += 1 + video_idx += 1 + remain_videos -= 1 + remain_audios -= 1 + + if st < len(input_tokens): + st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 + text_len = len(input_tokens) - st + llm_pos_ids_list.append( + torch.arange(text_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + if llm_positions.shape[1] != seq_len: + raise RuntimeError("Position ids length mismatch with input ids length") + + mrope_position_delta = llm_positions.max() + 1 - seq_len + return llm_positions, mrope_position_delta diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 74262f8b94a68..e517109e94dd6 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -499,3 +499,40 @@ def run_dp_sharded_mrope_vision_model( "Found unassigned embeddings" ) return out_embeddings + + +def get_llm_pos_ids_for_vision( + start_idx: int, + vision_idx: int, + spatial_merge_size: int, + t_index: list[int], + grid_hs: torch.Tensor, + grid_ws: torch.Tensor, +) -> torch.Tensor: + llm_pos_ids_list = [] + llm_grid_h = grid_hs[vision_idx] // spatial_merge_size + llm_grid_w = grid_ws[vision_idx] // spatial_merge_size + h_index = ( + torch.arange(llm_grid_h) + .view(1, -1, 1) + .expand(len(t_index), -1, llm_grid_w) + .flatten() + ) + w_index = ( + torch.arange(llm_grid_w) + .view(1, 1, -1) + .expand(len(t_index), llm_grid_h, -1) + .flatten() + ) 
+ t_index_tensor = ( + torch.Tensor(t_index) + .to(llm_grid_h.device) + .view(-1, 1) + .expand(-1, llm_grid_h * llm_grid_w) + .long() + .flatten() + ) + _llm_pos_ids = torch.stack([t_index_tensor, h_index, w_index]) + llm_pos_ids_list.append(_llm_pos_ids + start_idx) + llm_pos_ids = torch.cat(llm_pos_ids_list, dim=1) + return llm_pos_ids diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2dce58237c7b0..a323835e575cc 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -875,7 +875,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): if mm_input.get("use_audio_in_video") is True: use_audio_in_video = True - if supports_mrope(self.model): + if supports_mrope(self.get_model()): req_state.mrope_positions, req_state.mrope_position_delta = ( self.model.get_mrope_input_positions( req_state.prompt_token_ids, From 55392bc87932da63b888e58f371fe4b67b438496 Mon Sep 17 00:00:00 2001 From: "sangho.lee" Date: Sat, 11 Oct 2025 00:28:23 -0500 Subject: [PATCH 05/30] [Bugfix][Multi Modal] Fix incorrect Molmo image processing (#26563) Signed-off-by: sanghol --- vllm/model_executor/models/molmo.py | 40 +++++++++++++++++------------ 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 734841d0dc983..f1dd06f3a0650 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -114,11 +114,11 @@ class MolmoImageInputs(TensorSchema): TensorShape("bn", "nc", "np", dynamic_dims={"nc"}), ] - feat_is_patch: Annotated[ + image_input_idx: Annotated[ Union[torch.Tensor, list[torch.Tensor]], TensorShape("bn", "nc", "tp", dynamic_dims={"nc"}), ] - # A boolean mask indicating which image features correspond to patch tokens. + # An index tensor that maps image features to their corresponding patch tokens. 
num_crops: Annotated[torch.Tensor, TensorShape("bn")] @@ -1177,7 +1177,7 @@ class MolmoProcessorWrapper: num_crops = torch.tensor(tilings).prod(-1) + 1 assert num_crops.sum() == len(feat_is_patch) - outputs["feat_is_patch"] = feat_is_patch + outputs["image_input_idx"] = image_input_idx outputs["num_crops"] = num_crops outputs["img_patch_id"] = self.image_patch_id @@ -1211,8 +1211,9 @@ class MolmoProcessingInfo(BaseProcessingInfo): image_token_length_w = processor.image_token_length_w image_token_length_h = processor.image_token_length_h - extra = image_token_length_w * image_token_length_h - joint = ((ncols + 1) // pooling_size) * ((nrows + 1) // pooling_size) + # Calculate total tokens: 2 for start/end + (w+1)*h for column separators + extra = 2 + (image_token_length_w + 1) * image_token_length_h + joint = 2 + ((ncols + 1) // pooling_size + 1) * ((nrows + 1) // pooling_size) return extra + joint @@ -1299,7 +1300,7 @@ class MolmoMultiModalProcessor(BaseMultiModalProcessor[MolmoProcessingInfo]): return dict( images=MultiModalFieldConfig.flat_from_sizes("image", num_crops), image_masks=MultiModalFieldConfig.flat_from_sizes("image", num_crops), - feat_is_patch=MultiModalFieldConfig.flat_from_sizes("image", num_crops), + image_input_idx=MultiModalFieldConfig.flat_from_sizes("image", num_crops), num_crops=MultiModalFieldConfig.batched("image"), img_patch_id=MultiModalFieldConfig.shared("image", num_images), ) @@ -1444,7 +1445,7 @@ class MolmoForCausalLM( ) -> Optional[MolmoImageInputs]: images = kwargs.pop("images", None) image_masks = kwargs.pop("image_masks", None) - feat_is_patch = kwargs.pop("feat_is_patch", None) + image_input_idx = kwargs.pop("image_input_idx", None) num_crops = kwargs.pop("num_crops", None) if images is None: @@ -1466,7 +1467,7 @@ class MolmoForCausalLM( return MolmoImageInputs( images=images, image_masks=image_masks, - feat_is_patch=feat_is_patch, + image_input_idx=image_input_idx, num_crops=num_crops, ) @@ -1476,7 +1477,7 @@ class 
MolmoForCausalLM( ) -> list[torch.Tensor]: images = image_input["images"] image_masks = image_input["image_masks"] - feat_is_patch = image_input["feat_is_patch"] + image_input_idx = image_input["image_input_idx"] num_crops = image_input["num_crops"] # Call the vision backbone on the whole batch at once @@ -1484,7 +1485,7 @@ class MolmoForCausalLM( image_masks_flat = ( None if image_masks is None else flatten_bn(image_masks, concat=True) ) - feat_is_patch_flat = flatten_bn(feat_is_patch, concat=True) + image_input_idx_flat = flatten_bn(image_input_idx, concat=True) image_features_flat = self.vision_backbone( images=images_flat.unsqueeze(0), @@ -1494,13 +1495,18 @@ class MolmoForCausalLM( ).squeeze(0) # Only the features corresponding to patch tokens are relevant - return [ - feats[f_is_patch] - for feats, f_is_patch in zip( - image_features_flat.split(num_crops.tolist()), - feat_is_patch_flat.split(num_crops.tolist()), - ) - ] + # Re-order the features using the image_input_idx tensor + results = [] + num_crops_list = num_crops.tolist() + for feats, img_idx in zip( + image_features_flat.split(num_crops_list), + image_input_idx_flat.split(num_crops_list), + ): + is_valid = img_idx >= 0 + valid_img_idx = img_idx[is_valid] + order = torch.argsort(valid_img_idx) + results.append(feats[is_valid][order]) + return results def get_language_model(self) -> torch.nn.Module: return self.model From 727144bed10ffd465e37d47a5a60747efc15368b Mon Sep 17 00:00:00 2001 From: dsinghvi Date: Sat, 11 Oct 2025 12:51:04 +0530 Subject: [PATCH 06/30] [Refactor]: Use M-RoPE interface directly while defining model class instead of maintaining model specific M-RoPE implementation in mrope.py (#24172) Signed-off-by: Divyansh Singhvi Signed-off-by: dsinghvi Signed-off-by: DarkLight1337 Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: DarkLight1337 Co-authored-by: wwl2755 --- .../layers/rotary_embedding/mrope.py | 1015 
----------------- vllm/model_executor/models/ernie45_vl.py | 151 ++- vllm/model_executor/models/glm4v.py | 152 ++- vllm/model_executor/models/keye_vl1_5.py | 144 ++- .../models/qwen2_5_omni_thinker.py | 271 ++++- vllm/model_executor/models/qwen2_5_vl.py | 130 ++- vllm/model_executor/models/qwen3_vl.py | 115 +- vllm/model_executor/models/utils.py | 8 + vllm/v1/worker/gpu_model_runner.py | 39 +- 9 files changed, 974 insertions(+), 1051 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index ebfe9257c6c45..fce110e6a5270 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import itertools from typing import Optional, Union import numpy as np @@ -411,969 +410,6 @@ class MRotaryEmbedding(RotaryEmbedding): return llm_positions.tolist(), mrope_position_delta - @classmethod - def get_input_positions_tensor( - cls, - input_tokens: list[int], - hf_config: PretrainedConfig, - image_grid_thw: Union[list[list[int]], torch.Tensor], - video_grid_thw: Union[list[list[int]], torch.Tensor], - second_per_grid_ts: list[float], - context_len: int = 0, - seq_len: Optional[int] = None, - audio_feature_lengths: Optional[torch.Tensor] = None, - use_audio_in_video: bool = False, - ) -> tuple[torch.Tensor, int]: - from vllm.transformers_utils.config import thinker_uses_mrope - - if thinker_uses_mrope(hf_config) and hf_config.model_type == "qwen2_5_omni": - return cls._omni_get_input_positions_tensor( - input_tokens=input_tokens, - hf_config=hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - context_len=context_len, - seq_len=seq_len, - audio_feature_lengths=audio_feature_lengths, - use_audio_in_video=use_audio_in_video, - ) - elif 
hf_config.model_type in ["glm4v", "glm4v_moe"]: - return cls._glm4v_get_input_positions_tensor( - input_tokens=input_tokens, - hf_config=hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - context_len=context_len, - seq_len=seq_len, - ) - elif hf_config.model_type in ["qwen3_vl", "qwen3_vl_moe"]: - return cls._qwen3vl_get_input_positions_tensor( - input_tokens=input_tokens, - hf_config=hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - context_len=context_len, - seq_len=seq_len, - ) - elif hf_config.model_type in ["ernie4_5_moe_vl", "ernie4_5_vl"]: - return cls._ernie_get_input_positions_tensor( - input_tokens=input_tokens, - hf_config=hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - context_len=context_len, - seq_len=seq_len, - ) - elif "KeyeVL1_5" in hf_config.model_type: - return cls._keye_get_input_positions_tensor( - input_tokens=input_tokens, - hf_config=hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - context_len=context_len, - seq_len=seq_len, - ) - else: - return cls._vl_get_input_positions_tensor( - input_tokens=input_tokens, - hf_config=hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - context_len=context_len, - seq_len=seq_len, - ) - - @classmethod - def _glm4v_get_input_positions_tensor( - cls, - input_tokens: list[int], - hf_config: PretrainedConfig, - image_grid_thw: Union[list[list[int]], torch.Tensor], - video_grid_thw: Union[list[list[int]], torch.Tensor], - context_len: int = 0, - seq_len: Optional[int] = None, - ) -> tuple[torch.Tensor, int]: - """Get mrope input positions and delta value for GLM4V.""" - - image_token_id = hf_config.image_token_id - video_start_token_id = hf_config.video_start_token_id - video_end_token_id = hf_config.video_end_token_id - spatial_merge_size = hf_config.vision_config.spatial_merge_size - llm_pos_ids_list: list = [] - - if not 
(image_grid_thw is None and video_grid_thw is None): - if isinstance(image_grid_thw, torch.Tensor): - image_grid_thw = image_grid_thw.tolist() - - input_token_type: list[str] = [] - video_check_flg = False - for token in input_tokens: - if token == video_start_token_id: - video_check_flg = True - elif token == video_end_token_id: - video_check_flg = False - - if (token == image_token_id) and (video_check_flg is False): - input_token_type.append("image") - elif (token == image_token_id) and (video_check_flg is True): - input_token_type.append("video") - else: - input_token_type.append("text") - - input_type_group: list[tuple[str, int, int]] = [] - for key, group_iter in itertools.groupby( - enumerate(input_token_type), lambda x: x[1] - ): - group_list = list(group_iter) - start_index = group_list[0][0] - end_index = group_list[-1][0] + 1 - input_type_group.append((key, start_index, end_index)) - - video_frame_num = 1 - mm_data_idx = 0 - for modality_type, start_idx, end_idx in input_type_group: - st_idx = ( - llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 - ) - if modality_type == "image": - t, h, w = ( - image_grid_thw[mm_data_idx][0], - image_grid_thw[mm_data_idx][1], - image_grid_thw[mm_data_idx][2], - ) - llm_grid_t, llm_grid_h, llm_grid_w = ( - t, - h // spatial_merge_size, - w // spatial_merge_size, - ) - - t_index = ( - torch.arange(llm_grid_t) - .view(-1, 1) - .expand(-1, llm_grid_h * llm_grid_w) - .flatten() - ) - h_index = ( - torch.arange(llm_grid_h) - .view(1, -1, 1) - .expand(llm_grid_t, -1, llm_grid_w) - .flatten() - ) - w_index = ( - torch.arange(llm_grid_w) - .view(1, 1, -1) - .expand(llm_grid_t, llm_grid_h, -1) - .flatten() - ) - llm_pos_ids_list.append( - torch.stack([t_index, h_index, w_index]) + st_idx - ) - mm_data_idx += 1 - - elif modality_type == "video": - t, h, w = ( - video_frame_num, - image_grid_thw[mm_data_idx][1], - image_grid_thw[mm_data_idx][2], - ) - llm_grid_t, llm_grid_h, llm_grid_w = ( - t, - h // 
spatial_merge_size, - w // spatial_merge_size, - ) - - for t_idx in range(llm_grid_t): - t_index = ( - torch.tensor(t_idx) - .view(-1, 1) - .expand(-1, llm_grid_h * llm_grid_w) - .flatten() - ) - h_index = ( - torch.arange(llm_grid_h) - .view(1, -1, 1) - .expand(1, -1, llm_grid_w) - .flatten() - ) - w_index = ( - torch.arange(llm_grid_w) - .view(1, 1, -1) - .expand(1, llm_grid_h, -1) - .flatten() - ) - llm_pos_ids_list.append( - torch.stack([t_index, h_index, w_index]) + st_idx - ) - - mm_data_idx += 1 - video_frame_num += 1 - - else: - text_len = end_idx - start_idx - llm_pos_ids_list.append( - torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx - ) - video_frame_num = 1 - - else: - text_len = len(input_tokens) - llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1)) - - llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) - llm_positions = llm_positions[:, context_len:seq_len] - mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item() - return llm_positions, mrope_position_delta - - @classmethod - def _qwen3vl_get_input_positions_tensor( - cls, - input_tokens: list[int], - hf_config: PretrainedConfig, - image_grid_thw: Union[list[list[int]], torch.Tensor], - video_grid_thw: Union[list[list[int]], torch.Tensor], - context_len: int = 0, - seq_len: Optional[int] = None, - ) -> tuple[torch.Tensor, int]: - """Get mrope input positions and delta value.""" - - video_grid_thw = [[1, h, w] for t, h, w in video_grid_thw for _ in range(t)] - - image_token_id = hf_config.image_token_id - video_token_id = hf_config.video_token_id - vision_start_token_id = hf_config.vision_start_token_id - spatial_merge_size = hf_config.vision_config.spatial_merge_size - - input_tokens_tensor = torch.tensor(input_tokens) - vision_start_indices = torch.argwhere( - input_tokens_tensor == vision_start_token_id - ).squeeze(1) - vision_tokens = input_tokens_tensor[vision_start_indices + 1] - image_nums = (vision_tokens == 
image_token_id).sum() - video_nums = (vision_tokens == video_token_id).sum() - llm_pos_ids_list: list = [] - - st = 0 - remain_images, remain_videos = image_nums, video_nums - - image_index, video_index = 0, 0 - for _ in range(image_nums + video_nums): - if image_token_id in input_tokens and remain_images > 0: - ed_image = input_tokens.index(image_token_id, st) - else: - ed_image = len(input_tokens) + 1 - if video_token_id in input_tokens and remain_videos > 0: - ed_video = input_tokens.index(video_token_id, st) - else: - ed_video = len(input_tokens) + 1 - if ed_image < ed_video: - t, h, w = ( - image_grid_thw[image_index][0], - image_grid_thw[image_index][1], - image_grid_thw[image_index][2], - ) - image_index += 1 - remain_images -= 1 - ed = ed_image - else: - t, h, w = ( - video_grid_thw[video_index][0], - video_grid_thw[video_index][1], - video_grid_thw[video_index][2], - ) - video_index += 1 - remain_videos -= 1 - ed = ed_video - - llm_grid_t, llm_grid_h, llm_grid_w = ( - t, - h // spatial_merge_size, - w // spatial_merge_size, - ) - text_len = ed - st - - st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 - llm_pos_ids_list.append( - torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx - ) - - t_index = ( - torch.arange(llm_grid_t) - .view(-1, 1) - .expand(-1, llm_grid_h * llm_grid_w) - .flatten() - ) - h_index = ( - torch.arange(llm_grid_h) - .view(1, -1, 1) - .expand(llm_grid_t, -1, llm_grid_w) - .flatten() - ) - w_index = ( - torch.arange(llm_grid_w) - .view(1, 1, -1) - .expand(llm_grid_t, llm_grid_h, -1) - .flatten() - ) - llm_pos_ids_list.append( - torch.stack([t_index, h_index, w_index]) + text_len + st_idx - ) - st = ed + llm_grid_t * llm_grid_h * llm_grid_w - - if st < len(input_tokens): - st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 - text_len = len(input_tokens) - st - llm_pos_ids_list.append( - torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx - ) - - llm_positions = 
torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) - mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item() - llm_positions = llm_positions[:, context_len:seq_len] - return llm_positions, mrope_position_delta - - @classmethod - def _ernie_get_input_positions_tensor( - cls, - input_tokens: list[int], - hf_config: PretrainedConfig, - image_grid_thw: Union[list[list[int]], torch.Tensor], - video_grid_thw: Union[list[list[int]], torch.Tensor], - context_len: int = 0, - seq_len: Optional[int] = None, - ) -> tuple[torch.Tensor, int]: - """Get mrope input positions and delta value for Ernie VL.""" - - image_token_id = hf_config.im_patch_id - video_start_token_id = hf_config.video_start_token_id - video_end_token_id = hf_config.video_end_token_id - spatial_conv_size = hf_config.spatial_conv_size - temporal_conv_size = hf_config.temporal_conv_size - llm_pos_ids_list: list = [] - - if not (image_grid_thw is None and video_grid_thw is None): - if isinstance(image_grid_thw, torch.Tensor): - image_grid_thw = image_grid_thw.tolist() - - input_token_type: list[str] = [] - video_check_flg = False - for token in input_tokens: - if token == video_start_token_id: - video_check_flg = True - elif token == video_end_token_id: - video_check_flg = False - - if (token == image_token_id) and (video_check_flg is False): - input_token_type.append("image") - elif (token == image_token_id) and (video_check_flg is True): - input_token_type.append("video") - else: - input_token_type.append("text") - - input_type_group: list[tuple[str, int, int]] = [] - for key, group_iter in itertools.groupby( - enumerate(input_token_type), lambda x: x[1] - ): - group_list = list(group_iter) - start_index = group_list[0][0] - end_index = group_list[-1][0] + 1 - input_type_group.append((key, start_index, end_index)) - - video_frame_num = 1 - mm_data_idx = 0 - for modality_type, start_idx, end_idx in input_type_group: - st_idx = ( - llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 
else 0 - ) - if modality_type == "image": - t, h, w = ( - image_grid_thw[mm_data_idx][0], - image_grid_thw[mm_data_idx][1], - image_grid_thw[mm_data_idx][2], - ) - llm_grid_t, llm_grid_h, llm_grid_w = ( - t, - h // spatial_conv_size, - w // spatial_conv_size, - ) - - t_index = ( - torch.arange(llm_grid_t) - .view(-1, 1) - .expand(-1, llm_grid_h * llm_grid_w) - .flatten() - ) - h_index = ( - torch.arange(llm_grid_h) - .view(1, -1, 1) - .expand(llm_grid_t, -1, llm_grid_w) - .flatten() - ) - w_index = ( - torch.arange(llm_grid_w) - .view(1, 1, -1) - .expand(llm_grid_t, llm_grid_h, -1) - .flatten() - ) - llm_pos_ids_list.append( - torch.stack([t_index, h_index, w_index]) + st_idx - ) - mm_data_idx += 1 - - elif modality_type == "video": - t, h, w = ( - video_grid_thw[mm_data_idx][0], - video_grid_thw[mm_data_idx][1], - video_grid_thw[mm_data_idx][2], - ) - llm_grid_t, llm_grid_h, llm_grid_w = ( - t // temporal_conv_size, - h // spatial_conv_size, - w // spatial_conv_size, - ) - - for t_idx in range(llm_grid_t): - t_index = ( - torch.tensor(t_idx) - .view(-1, 1) - .expand(-1, llm_grid_h * llm_grid_w) - .flatten() - ) - h_index = ( - torch.arange(llm_grid_h) - .view(1, -1, 1) - .expand(1, -1, llm_grid_w) - .flatten() - ) - w_index = ( - torch.arange(llm_grid_w) - .view(1, 1, -1) - .expand(1, llm_grid_h, -1) - .flatten() - ) - llm_pos_ids_list.append( - torch.stack([t_index, h_index, w_index]) + st_idx - ) - - mm_data_idx += 1 - video_frame_num += 1 - - else: - text_len = end_idx - start_idx - llm_pos_ids_list.append( - torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx - ) - video_frame_num = 1 - - else: - text_len = len(input_tokens) - llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1)) - - llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) - llm_positions = llm_positions[:, context_len:seq_len] - mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item() - return llm_positions, mrope_position_delta - - 
@classmethod - def _keye_get_input_positions_tensor( - cls, - input_tokens: list[int], - hf_config: PretrainedConfig, - image_grid_thw: Union[list[list[int]], torch.Tensor], - video_grid_thw: Union[list[list[int]], torch.Tensor], - context_len: int = 0, - seq_len: Optional[int] = None, - ) -> tuple[torch.Tensor, int]: - if isinstance(video_grid_thw, list) and len(video_grid_thw) > 0: - video_grid_thw = video_grid_thw[0] - """Get mrope input positions and delta value (Keye series).""" - - def split_thw(grid_thw: Union[torch.Tensor, list[int]]) -> list[list[int]]: - """ - Split grid_thw along the t dimension. - - Args: - grid_thw: shape [N, 3] tensor or nested list of [t, h, w]. - - Returns: - List of [1, h, w] rows, repeated t times for each original row. - """ - - if isinstance(grid_thw, list): - grid_thw = torch.tensor(grid_thw, dtype=torch.long) - - if grid_thw.numel() == 0: - return [] - - t, hw = grid_thw[:, 0], grid_thw[:, 1:] - ones = torch.ones_like(hw[:, :1]) # [N,1] - out = torch.cat([ones, hw], dim=1).repeat_interleave(t, dim=0) - return out.tolist() - - video_grid_thw = split_thw(video_grid_thw) - - image_token_id = hf_config.image_token_id - video_token_id = hf_config.video_token_id - spatial_merge_size = hf_config.vision_config.spatial_merge_size - - image_nums = len(image_grid_thw) - frame_nums = len(video_grid_thw) - llm_pos_ids_list: list = [] - - st = 0 - remain_images, remain_frames = image_nums, frame_nums - - image_index, video_index = 0, 0 - for _ in range(image_nums + frame_nums): - if remain_images > 0: - try: - ed_image = input_tokens.index(image_token_id, st) - except ValueError: - ed_image = len(input_tokens) + 1 - else: - ed_image = len(input_tokens) + 1 - if remain_frames > 0: - try: - ed_video = input_tokens.index(video_token_id, st) - except ValueError: - ed_video = len(input_tokens) + 1 - else: - ed_video = len(input_tokens) + 1 - - if ed_image < ed_video: - t, h, w = ( - image_grid_thw[image_index][0], - 
image_grid_thw[image_index][1], - image_grid_thw[image_index][2], - ) - image_index += 1 - remain_images -= 1 - ed = ed_image - else: - t, h, w = ( - video_grid_thw[video_index][0], - video_grid_thw[video_index][1], - video_grid_thw[video_index][2], - ) - video_index += 1 - remain_frames -= 1 - ed = ed_video - - llm_grid_t, llm_grid_h, llm_grid_w = ( - t, - h // spatial_merge_size, - w // spatial_merge_size, - ) - text_len = ed - st - - st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 - llm_pos_ids_list.append( - torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx - ) - t_index = ( - torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w) - ).flatten() - h_index = ( - torch.arange(llm_grid_h) - .view(1, -1, 1) - .expand(llm_grid_t, -1, llm_grid_w) - .flatten() - ) - w_index = ( - torch.arange(llm_grid_w) - .view(1, 1, -1) - .expand(llm_grid_t, llm_grid_h, -1) - .flatten() - ) - llm_pos_ids_list.append( - torch.stack([t_index, h_index, w_index]) + text_len + st_idx - ) - st = ed + llm_grid_t * llm_grid_h * llm_grid_w - - if st < len(input_tokens): - st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 - text_len = len(input_tokens) - st - llm_pos_ids_list.append( - torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx - ) - - llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) - mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item() - llm_positions = llm_positions[:, context_len:seq_len] - - return llm_positions, mrope_position_delta - - @classmethod - def _vl_get_input_positions_tensor( - cls, - input_tokens: list[int], - hf_config: PretrainedConfig, - image_grid_thw: Union[list[list[int]], torch.Tensor], - video_grid_thw: Union[list[list[int]], torch.Tensor], - second_per_grid_ts: list[float], - context_len: int = 0, - seq_len: Optional[int] = None, - ) -> tuple[torch.Tensor, int]: - """Get mrope input positions and delta value.""" - - image_token_id = 
hf_config.image_token_id - video_token_id = hf_config.video_token_id - vision_start_token_id = hf_config.vision_start_token_id - spatial_merge_size = hf_config.vision_config.spatial_merge_size - tokens_per_second = getattr(hf_config.vision_config, "tokens_per_second", 1.0) - - input_tokens_tensor = torch.tensor(input_tokens) - vision_start_indices = torch.argwhere( - input_tokens_tensor == vision_start_token_id - ).squeeze(1) - vision_tokens = input_tokens_tensor[vision_start_indices + 1] - image_nums = (vision_tokens == image_token_id).sum() - video_nums = (vision_tokens == video_token_id).sum() - llm_pos_ids_list: list = [] - - st = 0 - remain_images, remain_videos = image_nums, video_nums - image_index, video_index = 0, 0 - for _ in range(image_nums + video_nums): - video_second_per_grid_t = 0.0 - if remain_images > 0: - try: - ed_image = input_tokens.index(image_token_id, st) - except ValueError: - ed_image = len(input_tokens) + 1 - else: - ed_image = len(input_tokens) + 1 - if remain_videos > 0: - try: - ed_video = input_tokens.index(video_token_id, st) - except ValueError: - ed_video = len(input_tokens) + 1 - else: - ed_video = len(input_tokens) + 1 - if ed_image < ed_video: - t, h, w = ( - image_grid_thw[image_index][0], - image_grid_thw[image_index][1], - image_grid_thw[image_index][2], - ) - image_index += 1 - remain_images -= 1 - ed = ed_image - else: - t, h, w = ( - video_grid_thw[video_index][0], - video_grid_thw[video_index][1], - video_grid_thw[video_index][2], - ) - video_second_per_grid_t = 1.0 - if second_per_grid_ts: - video_second_per_grid_t = second_per_grid_ts[video_index] - video_index += 1 - remain_videos -= 1 - ed = ed_video - - llm_grid_t, llm_grid_h, llm_grid_w = ( - t, - h // spatial_merge_size, - w // spatial_merge_size, - ) - text_len = ed - st - - st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 - llm_pos_ids_list.append( - torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx - ) - t_index = ( - 
torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w) - * video_second_per_grid_t - * tokens_per_second - ).flatten() - h_index = ( - torch.arange(llm_grid_h) - .view(1, -1, 1) - .expand(llm_grid_t, -1, llm_grid_w) - .flatten() - ) - w_index = ( - torch.arange(llm_grid_w) - .view(1, 1, -1) - .expand(llm_grid_t, llm_grid_h, -1) - .flatten() - ) - llm_pos_ids_list.append( - torch.stack([t_index, h_index, w_index]) + text_len + st_idx - ) - st = ed + llm_grid_t * llm_grid_h * llm_grid_w - - if st < len(input_tokens): - st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 - text_len = len(input_tokens) - st - llm_pos_ids_list.append( - torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx - ) - - llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) - mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item() - llm_positions = llm_positions[:, context_len:seq_len] - - return llm_positions, mrope_position_delta - - @classmethod - def _omni_get_input_positions_tensor( - cls, - input_tokens: list[int], - hf_config: PretrainedConfig, - image_grid_thw: Union[list[list[int]], torch.Tensor], - video_grid_thw: Union[list[list[int]], torch.Tensor], - second_per_grid_ts: Optional[list[float]] = None, - context_len: int = 0, - seq_len: Optional[int] = None, - audio_feature_lengths: Optional[torch.Tensor] = None, - use_audio_in_video: bool = False, - ) -> tuple[torch.Tensor, int]: - """Get mrope input positions and delta value (Qwen2.5-Omni version). - - Differences from MRotaryEmbedding: - 1. Add audio support (and related `audio_feature_lengths`). - 2. Add `use_audio_in_video` option to read audio from video inputs. - In this case, audio and vision position ids will be split into - chunks and interleaved. - - Example: - - (V_i are vision position ids, A_i are audio position ids) - - |V_1 ... V_n|A_1 ... A_n|V_n+1 ... V_2n|A_n+1 ... A_2n|... - |vision chunk 1|audio chunk 1|vision chunk 2|audio chunk 2 |... 
- """ - - # TODO(fyabc): refactor and share more code with - # _vl_get_input_positions_tensor. - thinker_config = hf_config.thinker_config - - if isinstance(image_grid_thw, list): - image_grid_thw = torch.tensor(image_grid_thw) - if isinstance(video_grid_thw, list): - video_grid_thw = torch.tensor(video_grid_thw) - - audio_token_id = thinker_config.audio_token_index - image_token_id = thinker_config.image_token_index - video_token_id = thinker_config.video_token_index - audio_start_token_id = thinker_config.audio_start_token_id - audio_end_token_id = thinker_config.audio_end_token_id - vision_start_token_id = thinker_config.vision_start_token_id - vision_end_token_id = thinker_config.vision_end_token_id - seconds_per_chunk = thinker_config.seconds_per_chunk - spatial_merge_size = thinker_config.vision_config.spatial_merge_size - tokens_per_second = getattr( - thinker_config.vision_config, "tokens_per_second", 25 - ) - - src_item = input_tokens - audio_seqlens = audio_feature_lengths - if not second_per_grid_ts: - second_per_grid_ts = [1] * video_grid_thw.shape[0] - audio_idx = 0 - video_idx = 0 - image_idx = 0 - new_src_item: list[int] = [] - llm_pos_ids_list: list[torch.Tensor] = [] - - idx = 0 - while idx < len(src_item): - new_src_item_len = len(new_src_item) - start_idx = ( - llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 - ) - if src_item[idx] not in [audio_token_id, video_token_id, image_token_id]: - if use_audio_in_video and idx > 0: - if ( - src_item[idx] == vision_end_token_id - and src_item[idx - 1] == audio_end_token_id - ): - # processing the <|audio_eos|> before <|vision_eos|> - start_idx -= 1 - elif ( - src_item[idx] == audio_start_token_id - and src_item[idx - 1] == vision_start_token_id - ): - # processing the <|audio_bos|> after <|vision_eos|> - start_idx -= 1 - new_src_item.append(src_item[idx]) - llm_pos_ids = torch.tensor([start_idx], dtype=torch.long).expand(3, -1) - llm_pos_ids_list.append(llm_pos_ids) - elif src_item[idx] 
== audio_token_id: - assert audio_seqlens is not None - audio_seqlen = audio_seqlens[audio_idx] - place_num = ((audio_seqlen - 1) // 2 + 1 - 2) // 2 + 1 - new_src_item.extend([audio_token_id] * place_num) - llm_pos_ids = torch.arange(place_num).expand(3, -1) + start_idx - llm_pos_ids_list.append(llm_pos_ids) - audio_idx += 1 - elif src_item[idx] == image_token_id: - grid_t = image_grid_thw[image_idx][0] - grid_hs = image_grid_thw[:, 1] - grid_ws = image_grid_thw[:, 2] - t_index = torch.arange(grid_t) * 1 * tokens_per_second - llm_pos_ids = cls._get_llm_pos_ids_for_vision( - start_idx, image_idx, spatial_merge_size, t_index, grid_hs, grid_ws - ) - llm_pos_ids_list.append(llm_pos_ids) - vision_seqlen = image_grid_thw[image_idx].prod() // ( - spatial_merge_size**2 - ) - new_src_item.extend([image_token_id] * vision_seqlen) - image_idx += 1 - elif src_item[idx] == video_token_id and not use_audio_in_video: - grid_t = video_grid_thw[video_idx][0] - grid_hs = video_grid_thw[:, 1] - grid_ws = video_grid_thw[:, 2] - t_index = ( - torch.arange(grid_t) - * second_per_grid_ts[video_idx] - * tokens_per_second - ) - llm_pos_ids = cls._get_llm_pos_ids_for_vision( - start_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws - ) - llm_pos_ids_list.append(llm_pos_ids) - vision_seqlen = video_grid_thw[video_idx].prod() // ( - spatial_merge_size**2 - ) - new_src_item.extend([video_token_id] * vision_seqlen) - video_idx += 1 - else: - # read audio from video - assert audio_seqlens is not None - audio_seqlen = audio_seqlens[audio_idx] - vision_seqlen = video_grid_thw[video_idx].prod() // ( - spatial_merge_size**2 - ) - grid_t = video_grid_thw[video_idx][0] - grid_h = video_grid_thw[video_idx][1] - grid_w = video_grid_thw[video_idx][2] - grid_hs = video_grid_thw[:, 1] - grid_ws = video_grid_thw[:, 2] - t_ntoken_per_chunk = int(tokens_per_second * seconds_per_chunk) - t_index = ( - torch.arange(grid_t) - * second_per_grid_ts[video_idx] - * tokens_per_second - ) - 
t_index_split_chunk = cls._split_list_into_ranges( - t_index, t_ntoken_per_chunk - ) - place_num = (((audio_seqlen - 1) // 2 + 1 - 2) // 2 + 1) + 2 - pure_audio_len = place_num - 2 - added_audio_len = 0 - audio_llm_pos_ids_list: list[torch.Tensor] = [] - for t_chunk in t_index_split_chunk: - vision_ntoken_per_chunk = ( - len(t_chunk) * grid_h * grid_w // (spatial_merge_size**2) - ) - new_src_item.extend([video_token_id] * vision_ntoken_per_chunk) - vision_llm_pos_ids_list = cls._get_llm_pos_ids_for_vision( - start_idx, - video_idx, - spatial_merge_size, - t_chunk, - grid_hs, - grid_ws, - ).split(1, dim=1) - llm_pos_ids_list.extend(vision_llm_pos_ids_list) - new_src_item.extend( - min(t_ntoken_per_chunk, pure_audio_len - added_audio_len) - * [audio_token_id] - ) - audio_start_idx = ( - start_idx - if len(audio_llm_pos_ids_list) == 0 - else audio_llm_pos_ids_list[-1][0].item() + 1 - ) - if min(t_ntoken_per_chunk, pure_audio_len - added_audio_len) > 0: - audio_llm_pos_ids_list = ( - torch.arange( - min( - t_ntoken_per_chunk, pure_audio_len - added_audio_len - ) - ).expand(3, -1) - + audio_start_idx - ).split(1, dim=1) - else: - audio_llm_pos_ids_list = [] - added_audio_len += min( - t_ntoken_per_chunk, pure_audio_len - added_audio_len - ) - llm_pos_ids_list.extend(audio_llm_pos_ids_list) - if added_audio_len < pure_audio_len: - new_src_item.extend( - (pure_audio_len - added_audio_len) * [audio_token_id] - ) - audio_llm_pos_ids_list = ( - torch.arange(pure_audio_len - added_audio_len).expand(3, -1) - + llm_pos_ids_list[-1].max() - + 1 - ).split(1, dim=1) - llm_pos_ids_list.extend(audio_llm_pos_ids_list) - audio_idx += 1 - video_idx += 1 - # move to the next token - idx += len(new_src_item) - new_src_item_len - - llm_positions = torch.cat(llm_pos_ids_list, dim=1) - mrope_position_delta = ( - torch.cat(llm_pos_ids_list, dim=1).max() + 1 - len(src_item) - ) - llm_positions = llm_positions[:, context_len:seq_len] - - return llm_positions, mrope_position_delta - - 
@staticmethod - def _get_llm_pos_ids_for_vision( - start_idx: int, - vision_idx: int, - spatial_merge_size: int, - t_index: list[int], - grid_hs: torch.Tensor, - grid_ws: torch.Tensor, - ) -> torch.Tensor: - llm_pos_ids_list = [] - llm_grid_h = grid_hs[vision_idx] // spatial_merge_size - llm_grid_w = grid_ws[vision_idx] // spatial_merge_size - h_index = ( - torch.arange(llm_grid_h) - .view(1, -1, 1) - .expand(len(t_index), -1, llm_grid_w) - .flatten() - ) - w_index = ( - torch.arange(llm_grid_w) - .view(1, 1, -1) - .expand(len(t_index), llm_grid_h, -1) - .flatten() - ) - t_index_tensor = ( - torch.Tensor(t_index) - .to(llm_grid_h.device) - .view(-1, 1) - .expand(-1, llm_grid_h * llm_grid_w) - .long() - .flatten() - ) - _llm_pos_ids = torch.stack([t_index_tensor, h_index, w_index]) - llm_pos_ids_list.append(_llm_pos_ids + start_idx) - llm_pos_ids = torch.cat(llm_pos_ids_list, dim=1) - return llm_pos_ids - - @staticmethod - def _split_list_into_ranges(lst: torch.Tensor, interval: int) -> list[list[int]]: - ranges: list[list[int]] = [[] for _ in range((max(lst) // interval) + 1)] - for num in lst: - index = num // interval - ranges[index].append(num) - return ranges - @staticmethod def get_next_input_positions( mrope_position_delta: int, @@ -1403,54 +439,3 @@ class MRotaryEmbedding(RotaryEmbedding): dtype=out.dtype, ) out[:, out_offset : out_offset + num_new_tokens] = values - - @classmethod - def omni_get_updates_use_audio_in_video( - cls, - thinker_config: PretrainedConfig, - audio_len: int, - video_grid_thw: Union[list[int], torch.Tensor], - video_second_per_grid_t: float, - ) -> list[int]: - """Get video prompt updates when `use_audio_in_video` is True. - - In this case, audio and vision update ids will be split into - chunks and interleaved (details in `_omni_get_input_positions_tensor`). - - <|video_bos|><|VIDEO|><|video_eos|> => - <|video_bos|><|audio_bos|>(... 
chunks ...)<|audio_eos|><|video_eos|> - """ - - audio_token_id = thinker_config.audio_token_index - video_token_id = thinker_config.video_token_index - audio_start_token_id = thinker_config.audio_start_token_id - audio_end_token_id = thinker_config.audio_end_token_id - seconds_per_chunk = thinker_config.seconds_per_chunk - spatial_merge_size = thinker_config.vision_config.spatial_merge_size - tokens_per_second = getattr( - thinker_config.vision_config, "tokens_per_second", 25 - ) - - grid_t = video_grid_thw[0] - grid_h = video_grid_thw[1] - grid_w = video_grid_thw[2] - t_ntoken_per_chunk = int(tokens_per_second * seconds_per_chunk) - t_index = torch.arange(grid_t) * video_second_per_grid_t * tokens_per_second - t_index_split_chunk = cls._split_list_into_ranges(t_index, t_ntoken_per_chunk) - - updates = [audio_start_token_id] - added_audio_len = 0 - for t_chunk in t_index_split_chunk: - vision_ntoken_per_chunk = ( - len(t_chunk) * grid_h * grid_w // (spatial_merge_size**2) - ) - updates.extend([video_token_id] * vision_ntoken_per_chunk) - - audio_chunk_size = min(t_ntoken_per_chunk, audio_len - added_audio_len) - updates.extend(audio_chunk_size * [audio_token_id]) - added_audio_len += audio_chunk_size - if added_audio_len < audio_len: - updates.extend((audio_len - added_audio_len) * [audio_token_id]) - updates.extend([audio_end_token_id]) - - return updates diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index 2579a0ebf53ef..d5b2caa2ddfd6 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -23,6 +23,7 @@ # limitations under the License. 
"""Inference-only Erine VL model compatible with HuggingFace weights.""" +import itertools import math from collections.abc import Iterable, Mapping, Sequence from functools import partial @@ -33,7 +34,7 @@ import torch import torch.nn as nn import torch.nn.functional as F from einops import rearrange, repeat -from transformers import BatchFeature +from transformers import BatchFeature, PretrainedConfig from vllm.attention.backends.registry import _Backend from vllm.attention.layer import ( @@ -76,6 +77,7 @@ from .ernie45_vl_moe import Ernie4_5_VLMoeForCausalLM from .interfaces import ( MultiModalEmbeddings, SupportsLoRA, + SupportsMRoPE, SupportsMultiModal, SupportsPP, ) @@ -1271,7 +1273,7 @@ class Ernie4_5_VLDummyInputsBuilder(BaseDummyInputsBuilder[Ernie4_5_VLProcessing dummy_inputs=Ernie4_5_VLDummyInputsBuilder, ) class Ernie4_5_VLMoeForConditionalGeneration( - nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP + nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE ): merge_by_field_config = True @@ -1388,6 +1390,151 @@ class Ernie4_5_VLMoeForConditionalGeneration( else: self.visual_token_mask = None + @classmethod + def get_mrope_input_positions( + cls, + input_tokens: list[int], + hf_config: PretrainedConfig, + image_grid_thw: Union[list[list[int]], torch.Tensor], + video_grid_thw: Union[list[list[int]], torch.Tensor], + context_len: int = 0, + seq_len: Optional[int] = None, + second_per_grid_ts: Optional[list[float]] = None, + audio_feature_lengths: Optional[torch.Tensor] = None, + use_audio_in_video: bool = False, + ) -> tuple[torch.Tensor, int]: + """Get mrope input positions and delta value for Ernie VL.""" + + image_token_id = hf_config.im_patch_id + video_start_token_id = hf_config.video_start_token_id + video_end_token_id = hf_config.video_end_token_id + spatial_conv_size = hf_config.spatial_conv_size + temporal_conv_size = hf_config.temporal_conv_size + llm_pos_ids_list: list = [] + + if not (image_grid_thw is None and 
video_grid_thw is None): + if isinstance(image_grid_thw, torch.Tensor): + image_grid_thw = image_grid_thw.tolist() + + input_token_type: list[str] = [] + video_check_flg = False + for token in input_tokens: + if token == video_start_token_id: + video_check_flg = True + elif token == video_end_token_id: + video_check_flg = False + + if (token == image_token_id) and (video_check_flg is False): + input_token_type.append("image") + elif (token == image_token_id) and (video_check_flg is True): + input_token_type.append("video") + else: + input_token_type.append("text") + + input_type_group: list[tuple[str, int, int]] = [] + for key, group_iter in itertools.groupby( + enumerate(input_token_type), lambda x: x[1] + ): + group_list = list(group_iter) + start_index = group_list[0][0] + end_index = group_list[-1][0] + 1 + input_type_group.append((key, start_index, end_index)) + + video_frame_num = 1 + mm_data_idx = 0 + for modality_type, start_idx, end_idx in input_type_group: + st_idx = ( + llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + ) + if modality_type == "image": + t, h, w = ( + image_grid_thw[mm_data_idx][0], + image_grid_thw[mm_data_idx][1], + image_grid_thw[mm_data_idx][2], + ) + llm_grid_t, llm_grid_h, llm_grid_w = ( + t, + h // spatial_conv_size, + w // spatial_conv_size, + ) + + t_index = ( + torch.arange(llm_grid_t) + .view(-1, 1) + .expand(-1, llm_grid_h * llm_grid_w) + .flatten() + ) + h_index = ( + torch.arange(llm_grid_h) + .view(1, -1, 1) + .expand(llm_grid_t, -1, llm_grid_w) + .flatten() + ) + w_index = ( + torch.arange(llm_grid_w) + .view(1, 1, -1) + .expand(llm_grid_t, llm_grid_h, -1) + .flatten() + ) + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + st_idx + ) + mm_data_idx += 1 + + elif modality_type == "video": + t, h, w = ( + video_grid_thw[mm_data_idx][0], + video_grid_thw[mm_data_idx][1], + video_grid_thw[mm_data_idx][2], + ) + llm_grid_t, llm_grid_h, llm_grid_w = ( + t // temporal_conv_size, + h // 
spatial_conv_size, + w // spatial_conv_size, + ) + + for t_idx in range(llm_grid_t): + t_index = ( + torch.tensor(t_idx) + .view(-1, 1) + .expand(-1, llm_grid_h * llm_grid_w) + .flatten() + ) + h_index = ( + torch.arange(llm_grid_h) + .view(1, -1, 1) + .expand(1, -1, llm_grid_w) + .flatten() + ) + w_index = ( + torch.arange(llm_grid_w) + .view(1, 1, -1) + .expand(1, llm_grid_h, -1) + .flatten() + ) + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + st_idx + ) + + mm_data_idx += 1 + video_frame_num += 1 + + else: + text_len = end_idx - start_idx + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx + ) + video_frame_num = 1 + + else: + text_len = len(input_tokens) + llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1)) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + llm_positions = llm_positions[:, context_len:seq_len] + mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item() + return llm_positions, mrope_position_delta + def get_language_model(self) -> torch.nn.Module: return self.language_model diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index a5c3ce0e6bf74..63731b2947d2d 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -5,6 +5,7 @@ # https://github.com/zai-org/CogAgent """Inference-only CogAgent model compatible with THUDM weights.""" +import itertools from argparse import Namespace from collections.abc import Mapping, Sequence from typing import Annotated, Literal, Optional, Union @@ -14,7 +15,7 @@ from torch import nn from torch.nn import LayerNorm from torchvision import transforms from torchvision.transforms import InterpolationMode -from transformers import BatchFeature, PreTrainedTokenizer, TensorType +from transformers import BatchFeature, PretrainedConfig, PreTrainedTokenizer, TensorType from transformers.image_utils import ImageInput from 
transformers.tokenization_utils_base import TextInput @@ -54,6 +55,7 @@ from .chatglm import ChatGLMBaseModel, ChatGLMModel from .interfaces import ( MultiModalEmbeddings, SupportsLoRA, + SupportsMRoPE, SupportsMultiModal, SupportsPP, ) @@ -554,7 +556,9 @@ class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]): info=GLM4VProcessingInfo, dummy_inputs=GLM4VDummyInputsBuilder, ) -class GLM4VForCausalLM(ChatGLMBaseModel, SupportsMultiModal, SupportsLoRA, SupportsPP): +class GLM4VForCausalLM( + ChatGLMBaseModel, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE +): merge_by_field_config = True packed_modules_mapping = { @@ -615,6 +619,150 @@ class GLM4VForCausalLM(ChatGLMBaseModel, SupportsMultiModal, SupportsLoRA, Suppo return self.transformer.vision(pixel_values) + @classmethod + def get_mrope_input_positions( + cls, + input_tokens: list[int], + hf_config: PretrainedConfig, + image_grid_thw: Union[list[list[int]], torch.Tensor], + video_grid_thw: Union[list[list[int]], torch.Tensor], + context_len: int = 0, + seq_len: Optional[int] = None, + second_per_grid_ts: Optional[list[float]] = None, + audio_feature_lengths: Optional[torch.Tensor] = None, + use_audio_in_video: bool = False, + ) -> tuple[torch.Tensor, int]: + """Get mrope input positions and delta value for GLM4V.""" + + image_token_id = hf_config.image_token_id + video_start_token_id = hf_config.video_start_token_id + video_end_token_id = hf_config.video_end_token_id + spatial_merge_size = hf_config.vision_config.spatial_merge_size + llm_pos_ids_list: list = [] + + if not (image_grid_thw is None and video_grid_thw is None): + if isinstance(image_grid_thw, torch.Tensor): + image_grid_thw = image_grid_thw.tolist() + + input_token_type: list[str] = [] + video_check_flg = False + for token in input_tokens: + if token == video_start_token_id: + video_check_flg = True + elif token == video_end_token_id: + video_check_flg = False + + if (token == image_token_id) and (video_check_flg is 
False): + input_token_type.append("image") + elif (token == image_token_id) and (video_check_flg is True): + input_token_type.append("video") + else: + input_token_type.append("text") + + input_type_group: list[tuple[str, int, int]] = [] + for key, group_iter in itertools.groupby( + enumerate(input_token_type), lambda x: x[1] + ): + group_list = list(group_iter) + start_index = group_list[0][0] + end_index = group_list[-1][0] + 1 + input_type_group.append((key, start_index, end_index)) + + video_frame_num = 1 + mm_data_idx = 0 + for modality_type, start_idx, end_idx in input_type_group: + st_idx = ( + llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + ) + if modality_type == "image": + t, h, w = ( + image_grid_thw[mm_data_idx][0], + image_grid_thw[mm_data_idx][1], + image_grid_thw[mm_data_idx][2], + ) + llm_grid_t, llm_grid_h, llm_grid_w = ( + t, + h // spatial_merge_size, + w // spatial_merge_size, + ) + + t_index = ( + torch.arange(llm_grid_t) + .view(-1, 1) + .expand(-1, llm_grid_h * llm_grid_w) + .flatten() + ) + h_index = ( + torch.arange(llm_grid_h) + .view(1, -1, 1) + .expand(llm_grid_t, -1, llm_grid_w) + .flatten() + ) + w_index = ( + torch.arange(llm_grid_w) + .view(1, 1, -1) + .expand(llm_grid_t, llm_grid_h, -1) + .flatten() + ) + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + st_idx + ) + mm_data_idx += 1 + + elif modality_type == "video": + t, h, w = ( + video_frame_num, + image_grid_thw[mm_data_idx][1], + image_grid_thw[mm_data_idx][2], + ) + llm_grid_t, llm_grid_h, llm_grid_w = ( + t, + h // spatial_merge_size, + w // spatial_merge_size, + ) + + for t_idx in range(llm_grid_t): + t_index = ( + torch.tensor(t_idx) + .view(-1, 1) + .expand(-1, llm_grid_h * llm_grid_w) + .flatten() + ) + h_index = ( + torch.arange(llm_grid_h) + .view(1, -1, 1) + .expand(1, -1, llm_grid_w) + .flatten() + ) + w_index = ( + torch.arange(llm_grid_w) + .view(1, 1, -1) + .expand(1, llm_grid_h, -1) + .flatten() + ) + 
llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + st_idx + ) + + mm_data_idx += 1 + video_frame_num += 1 + + else: + text_len = end_idx - start_idx + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx + ) + video_frame_num = 1 + + else: + text_len = len(input_tokens) + llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1)) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + llm_positions = llm_positions[:, context_len:seq_len] + mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item() + return llm_positions, mrope_position_delta + def get_language_model(self) -> torch.nn.Module: return self.transformer diff --git a/vllm/model_executor/models/keye_vl1_5.py b/vllm/model_executor/models/keye_vl1_5.py index 578436fcad219..21d8099b43d16 100644 --- a/vllm/model_executor/models/keye_vl1_5.py +++ b/vllm/model_executor/models/keye_vl1_5.py @@ -38,7 +38,7 @@ from vllm.multimodal.processing import ( ) from vllm.utils.tensor_schema import TensorSchema, TensorShape -from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP +from .interfaces import SupportsLoRA, SupportsMRoPE, SupportsMultiModal, SupportsPP from .keye import ( BaseKeyeModule, BaseMultiModalProcessor, @@ -493,7 +493,7 @@ class KeyeVL1_5DummyInputsBuilder( dummy_inputs=KeyeVL1_5DummyInputsBuilder, ) class KeyeVL1_5ForConditionalGeneration( - BaseKeyeModule, SupportsMultiModal, SupportsLoRA, SupportsPP + BaseKeyeModule, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE ): def _build_projector( self, @@ -589,3 +589,143 @@ class KeyeVL1_5ForConditionalGeneration( end = patch_cu_seqlens[idx + 1] new_video_embeds.append(video_embeds[start:end]) return tuple(new_video_embeds) + + @classmethod + def get_mrope_input_positions( + cls, + input_tokens: list[int], + hf_config: PretrainedConfig, + image_grid_thw: Union[list[list[int]], torch.Tensor], + video_grid_thw: Union[list[list[int]], 
torch.Tensor], + context_len: int = 0, + seq_len: Optional[int] = None, + second_per_grid_ts: Optional[list[float]] = None, + audio_feature_lengths: Optional[torch.Tensor] = None, + use_audio_in_video: bool = False, + ) -> tuple[torch.Tensor, int]: + if isinstance(video_grid_thw, list) and len(video_grid_thw) > 0: + video_grid_thw = video_grid_thw[0] + """Get mrope input positions and delta value (Keye series).""" + + def split_thw(grid_thw: Union[torch.Tensor, list[int]]) -> list[list[int]]: + """ + Split grid_thw along the t dimension. + + Args: + grid_thw: shape [N, 3] tensor or nested list of [t, h, w]. + + Returns: + List of [1, h, w] rows, repeated t times for each original row. + """ + + if isinstance(grid_thw, list): + grid_thw = torch.tensor(grid_thw, dtype=torch.long) + + if grid_thw.numel() == 0: + return [] + + t, hw = grid_thw[:, 0], grid_thw[:, 1:] + ones = torch.ones_like(hw[:, :1]) # [N,1] + out = torch.cat([ones, hw], dim=1).repeat_interleave(t, dim=0) + return out.tolist() + + video_grid_thw = split_thw(video_grid_thw) + + image_token_id = hf_config.image_token_id + video_token_id = hf_config.video_token_id + spatial_merge_size = hf_config.vision_config.spatial_merge_size + + image_nums = len(image_grid_thw) + frame_nums = len(video_grid_thw) + llm_pos_ids_list: list = [] + + st = 0 + remain_images, remain_frames = image_nums, frame_nums + + image_index, video_index = 0, 0 + for _ in range(image_nums + frame_nums): + if remain_images > 0: + try: + ed_image = input_tokens.index(image_token_id, st) + except ValueError: + ed_image = len(input_tokens) + 1 + else: + ed_image = len(input_tokens) + 1 + if remain_frames > 0: + try: + ed_video = input_tokens.index(video_token_id, st) + except ValueError: + ed_video = len(input_tokens) + 1 + else: + ed_video = len(input_tokens) + 1 + + if ed_image < ed_video: + t, h, w = ( + image_grid_thw[image_index][0], + image_grid_thw[image_index][1], + image_grid_thw[image_index][2], + ) + image_index += 1 + 
remain_images -= 1 + ed = ed_image + else: + t, h, w = ( + video_grid_thw[video_index][0], + video_grid_thw[video_index][1], + video_grid_thw[video_index][2], + ) + video_index += 1 + remain_frames -= 1 + ed = ed_video + + llm_grid_t, llm_grid_h, llm_grid_w = ( + t, + h // spatial_merge_size, + w // spatial_merge_size, + ) + text_len = ed - st + + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx + ) + + t_index = ( + ( + torch.arange(llm_grid_t) + .view(-1, 1) + .expand(-1, llm_grid_h * llm_grid_w) + ) + .long() + .flatten() + ) + + h_index = ( + torch.arange(llm_grid_h) + .view(1, -1, 1) + .expand(llm_grid_t, -1, llm_grid_w) + .flatten() + ) + w_index = ( + torch.arange(llm_grid_w) + .view(1, 1, -1) + .expand(llm_grid_t, llm_grid_h, -1) + .flatten() + ) + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + text_len + st_idx + ) + st = ed + llm_grid_t * llm_grid_h * llm_grid_w + + if st < len(input_tokens): + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + text_len = len(input_tokens) - st + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx + ) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item() + llm_positions = llm_positions[:, context_len:seq_len] + + return llm_positions, mrope_position_delta diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index 1ab2f43c9d736..0df79fc733f3f 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -29,6 +29,7 @@ from typing import Annotated, Any, Callable, Literal, Optional, Union import torch import torch.nn as nn +from transformers import PretrainedConfig from transformers.feature_extraction_utils import 
@classmethod
def omni_get_updates_use_audio_in_video(
    cls,
    thinker_config: PretrainedConfig,
    audio_len: int,
    video_grid_thw: Union[list[int], torch.Tensor],
    video_second_per_grid_t: float,
) -> list[int]:
    """Build the replacement token ids for one video when audio is read from it.

    The video placeholder is expanded into alternating runs of video tokens
    and audio tokens, one pair of runs per temporal chunk, wrapped in the
    audio start/end tokens:

    <|video_bos|><|VIDEO|><|video_eos|> =>
    <|video_bos|><|audio_bos|>(... chunks ...)<|audio_eos|><|video_eos|>

    Args:
        thinker_config: Config providing the token ids, `seconds_per_chunk`
            and `vision_config` (`spatial_merge_size`, optional
            `tokens_per_second`, default 25).
        audio_len: Total number of audio tokens to emit.
        video_grid_thw: The video's grid, read as `[t, h, w]`.
        video_second_per_grid_t: Seconds covered by one temporal grid step.

    Returns:
        The audio start token, the interleaved video/audio tokens, any audio
        tokens left over after the last chunk, and the audio end token.
    """
    vision_cfg = thinker_config.vision_config
    audio_id = thinker_config.audio_token_index
    video_id = thinker_config.video_token_index
    audio_bos_id = thinker_config.audio_start_token_id
    audio_eos_id = thinker_config.audio_end_token_id
    chunk_seconds = thinker_config.seconds_per_chunk
    merge = vision_cfg.spatial_merge_size
    tps = getattr(vision_cfg, "tokens_per_second", 25)

    grid_t, grid_h, grid_w = video_grid_thw[0], video_grid_thw[1], video_grid_thw[2]

    # Width of one chunk measured in temporal position units.
    chunk_width = int(tps * chunk_seconds)
    temporal_ids = (torch.arange(grid_t) * video_second_per_grid_t * tps).long()

    updates = [audio_bos_id]
    audio_emitted = 0
    for chunk in split_list_into_ranges(temporal_ids, chunk_width):
        # Vision tokens contributed by the frames that fall into this chunk.
        n_vision = len(chunk) * grid_h * grid_w // (merge**2)
        updates += [video_id] * n_vision

        # Audio tokens for this chunk, capped by what is still outstanding.
        n_audio = min(chunk_width, audio_len - audio_emitted)
        updates += n_audio * [audio_id]
        audio_emitted += n_audio

    # Audio longer than the video: append the remainder after the last chunk.
    if audio_emitted < audio_len:
        updates += (audio_len - audio_emitted) * [audio_id]
    updates.append(audio_eos_id)

    return updates
@classmethod
def get_mrope_input_positions(
    cls,
    input_tokens: list[int],
    hf_config: PretrainedConfig,
    image_grid_thw: Union[list[list[int]], torch.Tensor],
    video_grid_thw: Union[list[list[int]], torch.Tensor],
    second_per_grid_ts: Optional[list[float]] = None,
    context_len: int = 0,
    seq_len: Optional[int] = None,
    audio_feature_lengths: Optional[torch.Tensor] = None,
    use_audio_in_video: bool = False,
) -> tuple[torch.Tensor, int]:
    """Get mrope input positions and delta value (Qwen2.5-Omni version).

    Differences from MRotaryEmbedding:
        1. Add audio support (and related `audio_feature_lengths`).
        2. Add `use_audio_in_video` option to read audio from video inputs.
            In this case, audio and vision position ids will be split into
            chunks and interleaved.

    Example:

        (V_i are vision position ids, A_i are audio position ids)

        |V_1 ...    V_n|A_1 ...   A_n|V_n+1 ... V_2n|A_n+1 ... A_2n|...
        |vision chunk 1|audio chunk 1|vision chunk 2|audio chunk 2 |...

    Args:
        input_tokens: Prompt token ids. One placeholder token per
            audio/image/video item is consumed at a time; the scan then
            skips ahead by the number of positions that item expands to.
        hf_config: Config whose `thinker_config` provides the token ids,
            `seconds_per_chunk` and `vision_config`.
        image_grid_thw: Per-image grid rows, read as `[t, h, w]`.
        video_grid_thw: Per-video grid rows, read as `[t, h, w]`.
        second_per_grid_ts: Seconds per temporal grid step for each video;
            defaults to 1 for every video when empty or None.
        context_len: Start of the returned slice along the sequence axis.
        seq_len: End of the returned slice along the sequence axis.
        audio_feature_lengths: Per-audio feature lengths; required as soon
            as an audio token (or audio-in-video) is encountered.
        use_audio_in_video: Interleave audio and vision chunks for videos.

    Returns:
        `(llm_positions, mrope_position_delta)`: positions of shape
        `(3, seq)` sliced to `[context_len:seq_len]`, and
        `max_position + 1 - len(input_tokens)` as a Python int.
    """

    # TODO(fyabc): refactor and share more code with
    # _vl_get_input_positions_tensor.

    thinker_config = hf_config.thinker_config
    audio_token_id = thinker_config.audio_token_index
    image_token_id = thinker_config.image_token_index
    video_token_id = thinker_config.video_token_index
    audio_start_token_id = thinker_config.audio_start_token_id
    audio_end_token_id = thinker_config.audio_end_token_id
    vision_start_token_id = thinker_config.vision_start_token_id
    vision_end_token_id = thinker_config.vision_end_token_id
    seconds_per_chunk = thinker_config.seconds_per_chunk
    spatial_merge_size = thinker_config.vision_config.spatial_merge_size
    tokens_per_second = getattr(
        thinker_config.vision_config, "tokens_per_second", 25
    )

    if isinstance(image_grid_thw, list):
        image_grid_thw = torch.tensor(image_grid_thw)
    if isinstance(video_grid_thw, list):
        video_grid_thw = torch.tensor(video_grid_thw)

    src_item = input_tokens
    audio_seqlens = audio_feature_lengths
    if not second_per_grid_ts:
        second_per_grid_ts = [1] * video_grid_thw.shape[0]
    audio_idx = 0
    video_idx = 0
    image_idx = 0
    # Mirror of the tokens consumed so far; only its length is used, to
    # advance `idx` by however many tokens the current item covers.
    new_src_item: list[int] = []
    llm_pos_ids_list: list[torch.Tensor] = []

    idx = 0
    while idx < len(src_item):
        new_src_item_len = len(new_src_item)
        start_idx = (
            llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
        )
        if src_item[idx] not in [audio_token_id, video_token_id, image_token_id]:
            if use_audio_in_video and idx > 0:
                if (
                    src_item[idx] == vision_end_token_id
                    and src_item[idx - 1] == audio_end_token_id
                ):
                    # Vision end token directly after the audio end token:
                    # it reuses the previous position instead of advancing.
                    start_idx -= 1
                elif (
                    src_item[idx] == audio_start_token_id
                    and src_item[idx - 1] == vision_start_token_id
                ):
                    # Audio start token directly after the vision start
                    # token: it reuses the previous position as well.
                    start_idx -= 1
            new_src_item.append(src_item[idx])
            llm_pos_ids = torch.tensor([start_idx], dtype=torch.long).expand(3, -1)
            llm_pos_ids_list.append(llm_pos_ids)
        elif src_item[idx] == audio_token_id:
            assert audio_seqlens is not None
            audio_seqlen = audio_seqlens[audio_idx]
            # Number of placeholder positions derived from the feature length.
            place_num = ((audio_seqlen - 1) // 2 + 1 - 2) // 2 + 1
            new_src_item.extend([audio_token_id] * place_num)
            llm_pos_ids = torch.arange(place_num).expand(3, -1) + start_idx
            llm_pos_ids_list.append(llm_pos_ids)
            audio_idx += 1
        elif src_item[idx] == image_token_id:
            grid_t = image_grid_thw[image_idx][0]
            grid_hs = image_grid_thw[:, 1]
            grid_ws = image_grid_thw[:, 2]
            t_index = (torch.arange(grid_t) * 1 * tokens_per_second).long()
            llm_pos_ids = get_llm_pos_ids_for_vision(
                start_idx, image_idx, spatial_merge_size, t_index, grid_hs, grid_ws
            )
            llm_pos_ids_list.append(llm_pos_ids)
            vision_seqlen = image_grid_thw[image_idx].prod() // (
                spatial_merge_size**2
            )
            new_src_item.extend([image_token_id] * vision_seqlen)
            image_idx += 1
        elif src_item[idx] == video_token_id and not use_audio_in_video:
            grid_t = video_grid_thw[video_idx][0]
            grid_hs = video_grid_thw[:, 1]
            grid_ws = video_grid_thw[:, 2]
            t_index = (
                torch.arange(grid_t)
                * second_per_grid_ts[video_idx]
                * tokens_per_second
            ).long()
            llm_pos_ids = get_llm_pos_ids_for_vision(
                start_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws
            )
            llm_pos_ids_list.append(llm_pos_ids)
            vision_seqlen = video_grid_thw[video_idx].prod() // (
                spatial_merge_size**2
            )
            new_src_item.extend([video_token_id] * vision_seqlen)
            video_idx += 1
        else:
            # read audio from video
            assert audio_seqlens is not None
            audio_seqlen = audio_seqlens[audio_idx]
            grid_t = video_grid_thw[video_idx][0]
            grid_h = video_grid_thw[video_idx][1]
            grid_w = video_grid_thw[video_idx][2]
            grid_hs = video_grid_thw[:, 1]
            grid_ws = video_grid_thw[:, 2]
            t_ntoken_per_chunk = int(tokens_per_second * seconds_per_chunk)
            t_index = (
                torch.arange(grid_t)
                * second_per_grid_ts[video_idx]
                * tokens_per_second
            ).long()
            t_index_split_chunk = split_list_into_ranges(
                t_index, t_ntoken_per_chunk
            )
            place_num = (((audio_seqlen - 1) // 2 + 1 - 2) // 2 + 1) + 2
            pure_audio_len = place_num - 2
            added_audio_len = 0
            audio_llm_pos_ids_list: list[torch.Tensor] = []
            for t_chunk in t_index_split_chunk:
                vision_ntoken_per_chunk = (
                    len(t_chunk) * grid_h * grid_w // (spatial_merge_size**2)
                )
                new_src_item.extend([video_token_id] * vision_ntoken_per_chunk)
                vision_llm_pos_ids_list = get_llm_pos_ids_for_vision(
                    start_idx,
                    video_idx,
                    spatial_merge_size,
                    t_chunk,
                    grid_hs,
                    grid_ws,
                ).split(1, dim=1)
                llm_pos_ids_list.extend(vision_llm_pos_ids_list)

                # Audio tokens for this chunk, capped by what is still
                # outstanding; computed once and reused below.
                audio_chunk_len = min(
                    t_ntoken_per_chunk, pure_audio_len - added_audio_len
                )
                new_src_item.extend(audio_chunk_len * [audio_token_id])
                # The first audio chunk starts at the video's start position;
                # later ones continue after the previous audio chunk.
                audio_start_idx = (
                    start_idx
                    if len(audio_llm_pos_ids_list) == 0
                    else audio_llm_pos_ids_list[-1][0].item() + 1
                )
                if audio_chunk_len > 0:
                    audio_llm_pos_ids_list = (
                        torch.arange(audio_chunk_len).expand(3, -1)
                        + audio_start_idx
                    ).split(1, dim=1)
                else:
                    audio_llm_pos_ids_list = []
                added_audio_len += audio_chunk_len
                llm_pos_ids_list.extend(audio_llm_pos_ids_list)
            # Audio longer than the video: place the remainder after
            # everything emitted so far.
            if added_audio_len < pure_audio_len:
                new_src_item.extend(
                    (pure_audio_len - added_audio_len) * [audio_token_id]
                )
                audio_llm_pos_ids_list = (
                    torch.arange(pure_audio_len - added_audio_len).expand(3, -1)
                    + llm_pos_ids_list[-1].max()
                    + 1
                ).split(1, dim=1)
                llm_pos_ids_list.extend(audio_llm_pos_ids_list)
            audio_idx += 1
            video_idx += 1
        # move to the next token
        idx += len(new_src_item) - new_src_item_len

    # Concatenate once; the delta is taken over the full, unsliced positions
    # and converted to a Python int to match the declared return type.
    llm_positions = torch.cat(llm_pos_ids_list, dim=1)
    mrope_position_delta = (llm_positions.max() + 1 - len(src_item)).item()
    llm_positions = llm_positions[:, context_len:seq_len]

    return llm_positions, mrope_position_delta
@classmethod
def get_mrope_input_positions(
    cls,
    input_tokens: list[int],
    hf_config: PretrainedConfig,
    image_grid_thw: Union[list[list[int]], torch.Tensor],
    video_grid_thw: Union[list[list[int]], torch.Tensor],
    second_per_grid_ts: Optional[list[float]] = None,
    context_len: int = 0,
    seq_len: Optional[int] = None,
    audio_feature_lengths: Optional[torch.Tensor] = None,
    use_audio_in_video: bool = False,
) -> tuple[torch.Tensor, int]:
    """Get mrope input positions and delta value.

    Text tokens get identical, consecutive values on all three axes. Each
    image/video item gets a (t, h, w) grid of positions offset so that it
    follows the preceding text; text after an item resumes at the item's
    largest position + 1.

    Args:
        input_tokens: Prompt token ids.
        hf_config: Config providing `image_token_id`, `video_token_id`,
            `vision_start_token_id` and `vision_config`
            (`spatial_merge_size`, optional `tokens_per_second`,
            default 1.0).
        image_grid_thw: Per-image grid rows, read as `[t, h, w]`.
        video_grid_thw: Per-video grid rows, read as `[t, h, w]`.
        second_per_grid_ts: Seconds per temporal grid step for each video.
            When empty or None every video uses 1.0.
        context_len: Start of the returned slice along the sequence axis.
        seq_len: End of the returned slice along the sequence axis.
        audio_feature_lengths: Unused here; accepted for interface parity.
        use_audio_in_video: Unused here; accepted for interface parity.

    Returns:
        `(llm_positions, mrope_position_delta)`: positions of shape
        `(3, seq)` sliced to `[context_len:seq_len]`, and
        `max_position + 1 - len(input_tokens)` as a Python int.
    """

    image_token_id = hf_config.image_token_id
    video_token_id = hf_config.video_token_id
    vision_start_token_id = hf_config.vision_start_token_id
    spatial_merge_size = hf_config.vision_config.spatial_merge_size
    tokens_per_second = getattr(hf_config.vision_config, "tokens_per_second", 1.0)

    # An item is identified by the token that directly follows a vision
    # start token; count images and videos that way.
    input_tokens_tensor = torch.tensor(input_tokens)
    vision_start_indices = torch.argwhere(
        input_tokens_tensor == vision_start_token_id
    ).squeeze(1)
    vision_tokens = input_tokens_tensor[vision_start_indices + 1]
    # Plain ints: these are only used as loop bounds and counters.
    image_nums = int((vision_tokens == image_token_id).sum())
    video_nums = int((vision_tokens == video_token_id).sum())
    llm_pos_ids_list: list = []

    st = 0
    remain_images, remain_videos = image_nums, video_nums

    image_index, video_index = 0, 0
    for _ in range(image_nums + video_nums):
        # Images keep 0.0, so all of their temporal positions collapse to
        # the same value; videos overwrite this below.
        video_second_per_grid_t = 0.0
        # `len(input_tokens) + 1` acts as "not found" so the other modality
        # wins the comparison below.
        if remain_images > 0:
            try:
                ed_image = input_tokens.index(image_token_id, st)
            except ValueError:
                ed_image = len(input_tokens) + 1
        else:
            ed_image = len(input_tokens) + 1
        if remain_videos > 0:
            try:
                ed_video = input_tokens.index(video_token_id, st)
            except ValueError:
                ed_video = len(input_tokens) + 1
        else:
            ed_video = len(input_tokens) + 1
        if ed_image < ed_video:
            t, h, w = (
                image_grid_thw[image_index][0],
                image_grid_thw[image_index][1],
                image_grid_thw[image_index][2],
            )
            image_index += 1
            remain_images -= 1
            ed = ed_image
        else:
            t, h, w = (
                video_grid_thw[video_index][0],
                video_grid_thw[video_index][1],
                video_grid_thw[video_index][2],
            )
            video_second_per_grid_t = 1.0
            if second_per_grid_ts:
                video_second_per_grid_t = second_per_grid_ts[video_index]
            video_index += 1
            remain_videos -= 1
            ed = ed_video

        llm_grid_t, llm_grid_h, llm_grid_w = (
            t,
            h // spatial_merge_size,
            w // spatial_merge_size,
        )
        text_len = ed - st

        # Text between the previous item (or the start) and this item.
        st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
        llm_pos_ids_list.append(
            torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
        )

        t_index = (
            (
                torch.arange(llm_grid_t)
                .view(-1, 1)
                .expand(-1, llm_grid_h * llm_grid_w)
                * video_second_per_grid_t
                * tokens_per_second
            )
            .long()
            .flatten()
        )

        h_index = (
            torch.arange(llm_grid_h)
            .view(1, -1, 1)
            .expand(llm_grid_t, -1, llm_grid_w)
            .flatten()
        )
        w_index = (
            torch.arange(llm_grid_w)
            .view(1, 1, -1)
            .expand(llm_grid_t, llm_grid_h, -1)
            .flatten()
        )
        llm_pos_ids_list.append(
            torch.stack([t_index, h_index, w_index]) + text_len + st_idx
        )
        # Skip past the placeholder tokens this item occupies.
        st = ed + llm_grid_t * llm_grid_h * llm_grid_w

    # Trailing text after the last item (or the whole prompt if no items).
    if st < len(input_tokens):
        st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
        text_len = len(input_tokens) - st
        llm_pos_ids_list.append(
            torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
        )

    llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
    mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
    llm_positions = llm_positions[:, context_len:seq_len]

    return llm_positions, mrope_position_delta
@classmethod
def get_mrope_input_positions(
    cls,
    input_tokens: list[int],
    hf_config: PretrainedConfig,
    image_grid_thw: Union[list[list[int]], torch.Tensor],
    video_grid_thw: Union[list[list[int]], torch.Tensor],
    context_len: int = 0,
    seq_len: Optional[int] = None,
    second_per_grid_ts: Optional[list[float]] = None,
    audio_feature_lengths: Optional[torch.Tensor] = None,
    use_audio_in_video: bool = False,
) -> tuple[torch.Tensor, int]:
    """Get mrope input positions and delta value.

    Every video row `[t, h, w]` is first expanded into `t` single-frame rows
    `[1, h, w]`, so each frame is handled like a separate item. Text tokens
    get identical, consecutive values on all three axes; each item gets a
    (t, h, w) grid of positions offset to follow the preceding text.

    Args:
        input_tokens: Prompt token ids.
        hf_config: Config providing `image_token_id`, `video_token_id`,
            `vision_start_token_id` and `vision_config.spatial_merge_size`.
        image_grid_thw: Per-image grid rows, read as `[t, h, w]`.
        video_grid_thw: Per-video grid rows, read as `[t, h, w]`.
        context_len: Start of the returned slice along the sequence axis.
        seq_len: End of the returned slice along the sequence axis.
        second_per_grid_ts: Unused here; accepted for interface parity.
        audio_feature_lengths: Unused here; accepted for interface parity.
        use_audio_in_video: Unused here; accepted for interface parity.

    Returns:
        `(llm_positions, mrope_position_delta)`: positions of shape
        `(3, seq)` sliced to `[context_len:seq_len]`, and
        `max_position + 1 - len(input_tokens)` as a Python int.
    """

    video_grid_thw = [[1, h, w] for t, h, w in video_grid_thw for _ in range(t)]

    image_token_id = hf_config.image_token_id
    video_token_id = hf_config.video_token_id
    vision_start_token_id = hf_config.vision_start_token_id
    spatial_merge_size = hf_config.vision_config.spatial_merge_size

    # An item is identified by the token that directly follows a vision
    # start token; count images and video frames that way.
    input_tokens_tensor = torch.tensor(input_tokens)
    vision_start_indices = torch.argwhere(
        input_tokens_tensor == vision_start_token_id
    ).squeeze(1)
    vision_tokens = input_tokens_tensor[vision_start_indices + 1]
    # Plain ints: these are only used as loop bounds and counters.
    image_nums = int((vision_tokens == image_token_id).sum())
    video_nums = int((vision_tokens == video_token_id).sum())
    llm_pos_ids_list: list = []

    st = 0
    remain_images, remain_videos = image_nums, video_nums

    image_index, video_index = 0, 0
    for _ in range(image_nums + video_nums):
        # Search forward from `st` only. A missing token maps to
        # `len(input_tokens) + 1` ("not found") instead of raising, and no
        # full-list membership scan is needed on every iteration.
        if remain_images > 0:
            try:
                ed_image = input_tokens.index(image_token_id, st)
            except ValueError:
                ed_image = len(input_tokens) + 1
        else:
            ed_image = len(input_tokens) + 1
        if remain_videos > 0:
            try:
                ed_video = input_tokens.index(video_token_id, st)
            except ValueError:
                ed_video = len(input_tokens) + 1
        else:
            ed_video = len(input_tokens) + 1
        if ed_image < ed_video:
            t, h, w = (
                image_grid_thw[image_index][0],
                image_grid_thw[image_index][1],
                image_grid_thw[image_index][2],
            )
            image_index += 1
            remain_images -= 1
            ed = ed_image
        else:
            t, h, w = (
                video_grid_thw[video_index][0],
                video_grid_thw[video_index][1],
                video_grid_thw[video_index][2],
            )
            video_index += 1
            remain_videos -= 1
            ed = ed_video

        llm_grid_t, llm_grid_h, llm_grid_w = (
            t,
            h // spatial_merge_size,
            w // spatial_merge_size,
        )
        text_len = ed - st

        # Text between the previous item (or the start) and this item.
        st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
        llm_pos_ids_list.append(
            torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
        )

        t_index = (
            torch.arange(llm_grid_t)
            .view(-1, 1)
            .expand(-1, llm_grid_h * llm_grid_w)
            .flatten()
        )
        h_index = (
            torch.arange(llm_grid_h)
            .view(1, -1, 1)
            .expand(llm_grid_t, -1, llm_grid_w)
            .flatten()
        )
        w_index = (
            torch.arange(llm_grid_w)
            .view(1, 1, -1)
            .expand(llm_grid_t, llm_grid_h, -1)
            .flatten()
        )
        llm_pos_ids_list.append(
            torch.stack([t_index, h_index, w_index]) + text_len + st_idx
        )
        # Skip past the placeholder tokens this item occupies.
        st = ed + llm_grid_t * llm_grid_h * llm_grid_w

    # Trailing text after the last item (or the whole prompt if no items).
    if st < len(input_tokens):
        st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
        text_len = len(input_tokens) - st
        llm_pos_ids_list.append(
            torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
        )

    llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
    mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
    llm_positions = llm_positions[:, context_len:seq_len]
    return llm_positions, mrope_position_delta


# The helper below belongs to vllm/model_executor/models/utils.py in the
# original patch; it shares these source lines with the method above.
def split_list_into_ranges(lst: torch.Tensor, interval: int) -> list[list[int]]:
    """Bucket the elements of `lst` by `element // interval`.

    Bucket `k` collects, in input order, every element whose floor division
    by `interval` equals `k`. Buckets run from 0 up to the bucket of the
    largest element, so buckets in between may be empty.

    Args:
        lst: 1-D sequence of non-negative integers. When a tensor is passed
            the buckets hold its 0-d tensor elements unchanged, not ints.
        interval: Bucket width.

    Returns:
        The list of buckets; an empty list when `lst` is empty.
    """
    # `max()` raises on an empty sequence; nothing to bucket in that case.
    if len(lst) == 0:
        return []
    ranges: list[list[int]] = [[] for _ in range((max(lst) // interval) + 1)]
    for num in lst:
        index = num // interval
        ranges[index].append(num)
    return ranges
_merge_multimodal_embeddings( inputs_embeds: torch.Tensor, multimodal_embeddings: NestedTensors, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a323835e575cc..ec824f6d6bf5e 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -875,30 +875,19 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): if mm_input.get("use_audio_in_video") is True: use_audio_in_video = True - if supports_mrope(self.get_model()): - req_state.mrope_positions, req_state.mrope_position_delta = ( - self.model.get_mrope_input_positions( - req_state.prompt_token_ids, - hf_config=self.model_config.hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - audio_feature_lengths=audio_feature_lengths, - use_audio_in_video=use_audio_in_video, - ) - ) - else: - req_state.mrope_positions, req_state.mrope_position_delta = ( - MRotaryEmbedding.get_input_positions_tensor( - req_state.prompt_token_ids, - hf_config=self.model_config.hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - audio_feature_lengths=audio_feature_lengths, - use_audio_in_video=use_audio_in_video, - ) + assert supports_mrope(self.get_model()), "M-RoPE support is not implemented." 
+ + req_state.mrope_positions, req_state.mrope_position_delta = ( + self.model.get_mrope_input_positions( + req_state.prompt_token_ids, + hf_config=self.model_config.hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + audio_feature_lengths=audio_feature_lengths, + use_audio_in_video=use_audio_in_video, ) + ) def _extract_mm_kwargs( self, @@ -2900,7 +2889,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): logger.info("Loading drafter model...") self.drafter.load_model(self.model) if self.use_aux_hidden_state_outputs: - if not supports_eagle3(self.model): + if not supports_eagle3(self.get_model()): raise RuntimeError( "Model does not support EAGLE3 interface but " "aux_hidden_state_outputs was requested" @@ -2928,7 +2917,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): prepare_communication_buffer_for_model(self.model) self.is_multimodal_pruning_enabled = ( - supports_multimodal_pruning(self.model) + supports_multimodal_pruning(self.get_model()) and self.model_config.multimodal_config.is_multimodal_pruning_enabled() ) From 086609de64456bfcfa44e1f9236a940122156b02 Mon Sep 17 00:00:00 2001 From: ihb2032 <40718643+ihb2032@users.noreply.github.com> Date: Sat, 11 Oct 2025 17:12:16 +0800 Subject: [PATCH 07/30] fix(nix): Allow local oneDNN path to fix vLLM CPU build failure (#26401) Signed-off-by: lyd1992 Signed-off-by: ihb2032 <1355790728@qq.com> --- cmake/cpu_extension.cmake | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index a6e53588f4f0f..9bac5ea41c8d4 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -198,13 +198,24 @@ else() endif() if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) - FetchContent_Declare( - oneDNN - GIT_REPOSITORY 
https://github.com/oneapi-src/oneDNN.git - GIT_TAG v3.9 - GIT_PROGRESS TRUE - GIT_SHALLOW TRUE - ) + set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.") + + if(FETCHCONTENT_SOURCE_DIR_ONEDNN) + message(STATUS "Using oneDNN from specified source directory: ${FETCHCONTENT_SOURCE_DIR_ONEDNN}") + FetchContent_Declare( + oneDNN + SOURCE_DIR ${FETCHCONTENT_SOURCE_DIR_ONEDNN} + ) + else() + message(STATUS "Downloading oneDNN from GitHub") + FetchContent_Declare( + oneDNN + GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git + GIT_TAG v3.9 + GIT_PROGRESS TRUE + GIT_SHALLOW TRUE + ) + endif() if(USE_ACL) find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/) @@ -227,7 +238,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON set(ONEDNN_ENABLE_ITT_TASKS "OFF") set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF") set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF") - set(ONEDNN_VERBOSE "ON") + set(ONEDNN_VERBOSE "OFF") set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) FetchContent_MakeAvailable(oneDNN) From d2a71530c159c361f991a1ed986e64209651cc92 Mon Sep 17 00:00:00 2001 From: Rahul Tuli Date: Sat, 11 Oct 2025 15:44:41 +0530 Subject: [PATCH 08/30] Add EAGLE-3 Speculative Decoding Support for Qwen3 MoE (#26485) Signed-off-by: Rahul Tuli --- vllm/model_executor/models/qwen3_moe.py | 37 ++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 34b5af846493a..825272535a450 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -64,7 +64,7 @@ from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.models.utils import sequence_parallel_chunk from vllm.sequence import IntermediateTensors -from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP +from .interfaces import 
MixtureOfExperts, SupportsEagle3, SupportsLoRA, SupportsPP from .utils import ( AutoWeightsLoader, PPMissingLayer, @@ -422,6 +422,8 @@ class Qwen3MoeModel(nn.Module): self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size ) + # Track layers for auxiliary hidden state outputs (EAGLE3) + self.aux_hidden_state_layers: tuple[int, ...] = () def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.embed_tokens(input_ids) @@ -432,7 +434,9 @@ class Qwen3MoeModel(nn.Module): positions: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: + ) -> Union[ + torch.Tensor, IntermediateTensors, tuple[torch.Tensor, list[torch.Tensor]] + ]: if get_pp_group().is_first_rank: if inputs_embeds is not None: hidden_states = inputs_embeds @@ -443,13 +447,29 @@ class Qwen3MoeModel(nn.Module): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in islice(self.layers, self.start_layer, self.end_layer): + + aux_hidden_states = [] + for layer_idx, layer in enumerate( + islice(self.layers, self.start_layer, self.end_layer), + start=self.start_layer, + ): + # Collect auxiliary hidden states if specified + if layer_idx in self.aux_hidden_state_layers: + aux_hidden_state = ( + hidden_states + residual if residual is not None else hidden_states + ) + aux_hidden_states.append(aux_hidden_state) hidden_states, residual = layer(positions, hidden_states, residual) + if not get_pp_group().is_last_rank: return IntermediateTensors( {"hidden_states": hidden_states, "residual": residual} ) hidden_states, _ = self.norm(hidden_states, residual) + + # Return auxiliary hidden states if collected + if len(aux_hidden_states) > 0: + return hidden_states, aux_hidden_states return hidden_states def 
get_expert_mapping(self) -> list[tuple[str, str, int, str]]: @@ -606,7 +626,9 @@ class Qwen3MoeModel(nn.Module): return loaded_params -class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, MixtureOfExperts): +class Qwen3MoeForCausalLM( + nn.Module, SupportsPP, SupportsLoRA, SupportsEagle3, MixtureOfExperts +): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -702,6 +724,13 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, MixtureOfExperts) moe.n_redundant_experts = self.num_redundant_experts moe.experts.update_expert_map() + def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: + self.model.aux_hidden_state_layers = layers + + def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: + num_layers = len(self.model.layers) + return (2, num_layers // 2, num_layers - 3) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) From f7ee69868a26a7f1ce3e3c0086b6bcbee266e204 Mon Sep 17 00:00:00 2001 From: muzian666 <94029822+muzian666@users.noreply.github.com> Date: Sat, 11 Oct 2025 20:04:04 +0800 Subject: [PATCH 09/30] [CPU] fix the issue when the node is '-' cause json decode error. 
(#26562) Signed-off-by: muzian666 Co-authored-by: qingan.li --- vllm/platforms/cpu.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 2f87664003dcd..49c953fd36ee0 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -4,6 +4,7 @@ import json import os import platform +import re import subprocess import sys from dataclasses import dataclass @@ -336,6 +337,7 @@ class CpuPlatform(Platform): lscpu_output = subprocess.check_output( "lscpu -J -e=CPU,CORE,NODE", shell=True, text=True ) + lscpu_output = re.sub(r'"node":\s*-\s*(,|\n)', r'"node": 0\1', lscpu_output) logical_cpu_list: list[LogicalCPUInfo] = json.loads( lscpu_output, object_hook=LogicalCPUInfo.json_decoder )["cpus"] From d0bed837ac83aa33f6540d32cbcc75525328078e Mon Sep 17 00:00:00 2001 From: Chauncey Date: Sat, 11 Oct 2025 20:04:49 +0800 Subject: [PATCH 10/30] [Refactor]Reduce duplicate code in serving_chat (#26627) Signed-off-by: chaunceyjiang --- vllm/entrypoints/openai/serving_chat.py | 44 ++++-------------- vllm/entrypoints/openai/serving_engine.py | 47 +++++++++++++++++++- vllm/entrypoints/openai/serving_responses.py | 14 +----- 3 files changed, 56 insertions(+), 49 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 51c618e9d51d7..94c24ce9b307a 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -6,7 +6,7 @@ import json import time from collections.abc import AsyncGenerator, AsyncIterator from collections.abc import Sequence as GenericSequence -from typing import Callable, Final, Optional, Union +from typing import Final, Optional, Union import jinja2 import partial_json_parser @@ -56,14 +56,13 @@ from vllm.entrypoints.openai.protocol import ( ) from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs from vllm.entrypoints.openai.serving_models import OpenAIServingModels -from 
vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager +from vllm.entrypoints.openai.tool_parsers import ToolParser from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolCall from vllm.entrypoints.utils import get_max_tokens from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.logger import init_logger from vllm.logprobs import Logprob from vllm.outputs import CompletionOutput, RequestOutput -from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer from vllm.transformers_utils.tokenizers import ( @@ -112,42 +111,15 @@ class OpenAIServingChat(OpenAIServing): self.trust_request_chat_template = trust_request_chat_template self.enable_log_outputs = enable_log_outputs + # set up reasoning parser + self.reasoning_parser = self._get_reasoning_parser( + reasoning_parser_name=reasoning_parser + ) # set up tool use self.enable_auto_tools: bool = enable_auto_tools - if self.enable_auto_tools: - logger.info( - '"auto" tool choice has been enabled please note that while' - " the parallel_tool_calls client option is preset for " - "compatibility reasons, it will be ignored." 
- ) - - self.reasoning_parser: Optional[Callable[[AnyTokenizer], ReasoningParser]] = ( - None + self.tool_parser = self._get_tool_parser( + tool_parser_name=tool_parser, enable_auto_tools=enable_auto_tools ) - if reasoning_parser: - try: - self.reasoning_parser = ReasoningParserManager.get_reasoning_parser( - reasoning_parser - ) - assert self.reasoning_parser is not None - except Exception as e: - raise TypeError(f"{reasoning_parser=} has not been registered") from e - self.tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None - if self.enable_auto_tools: - try: - if tool_parser == "pythonic" and self.model_config.model.startswith( - "meta-llama/Llama-3.2" - ): - logger.warning( - "Llama3.2 models may struggle to emit valid pythonic tool calls" - ) - self.tool_parser = ToolParserManager.get_tool_parser(tool_parser) - except Exception as e: - raise TypeError( - "Error: --enable-auto-tool-choice requires " - f"tool_parser:'{tool_parser}' which has not " - "been registered" - ) from e self.exclude_tools_when_tool_choice_none = exclude_tools_when_tool_choice_none self.enable_prompt_tokens_details = enable_prompt_tokens_details diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index edb8ecc94382a..0d1a525c6d3da 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -63,7 +63,7 @@ from vllm.entrypoints.openai.protocol import ( TranslationRequest, ) from vllm.entrypoints.openai.serving_models import OpenAIServingModels -from vllm.entrypoints.openai.tool_parsers import ToolParser +from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer, RenderConfig from vllm.entrypoints.utils import _validate_truncation_size from vllm.inputs.data import PromptType @@ -82,6 +82,7 @@ from vllm.multimodal import ( # noqa: F401 - Required to resolve Pydantic error ) from vllm.outputs 
import CompletionOutput, PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams +from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.tracing import ( contains_trace_headers, @@ -274,6 +275,50 @@ class OpenAIServing: self.model_config = self.models.model_config self.max_model_len = self.model_config.max_model_len + def _get_tool_parser( + self, tool_parser_name: Optional[str] = None, enable_auto_tools: bool = False + ) -> Optional[Callable[[AnyTokenizer], ToolParser]]: + """Get the tool parser based on the name.""" + parser = None + if not enable_auto_tools or tool_parser_name is None: + return parser + logger.info( + '"auto" tool choice has been enabled please note that while' + " the parallel_tool_calls client option is preset for " + "compatibility reasons, it will be ignored." + ) + + try: + if tool_parser_name == "pythonic" and self.model_config.model.startswith( + "meta-llama/Llama-3.2" + ): + logger.warning( + "Llama3.2 models may struggle to emit valid pythonic tool calls" + ) + parser = ToolParserManager.get_tool_parser(tool_parser_name) + except Exception as e: + raise TypeError( + "Error: --enable-auto-tool-choice requires " + f"tool_parser:'{tool_parser_name}' which has not " + "been registered" + ) from e + return parser + + def _get_reasoning_parser( + self, + reasoning_parser_name: str, + ) -> Optional[Callable[[AnyTokenizer], ReasoningParser]]: + """Get the reasoning parser based on the name.""" + parser = None + if not reasoning_parser_name: + return None + try: + parser = ReasoningParserManager.get_reasoning_parser(reasoning_parser_name) + assert parser is not None + except Exception as e: + raise TypeError(f"{reasoning_parser_name=} has not been registered") from e + return parser + async def reset_mm_cache(self) -> None: self.processor.clear_mm_cache() await self.engine_client.reset_mm_cache() diff --git 
a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 48c5222bccc95..60f8b78ed1757 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -96,7 +96,6 @@ from vllm.logger import init_logger from vllm.logprobs import Logprob as SampleLogprob from vllm.logprobs import SampleLogprobs from vllm.outputs import CompletionOutput -from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import random_uuid @@ -136,18 +135,9 @@ class OpenAIServingResponses(OpenAIServing): self.chat_template_content_format: Final = chat_template_content_format self.enable_log_outputs = enable_log_outputs - self.reasoning_parser: Optional[Callable[[AnyTokenizer], ReasoningParser]] = ( - None + self.reasoning_parser = self._get_reasoning_parser( + reasoning_parser_name=reasoning_parser ) - if reasoning_parser: - try: - self.reasoning_parser = ReasoningParserManager.get_reasoning_parser( - reasoning_parser - ) - assert self.reasoning_parser is not None - except Exception as e: - raise TypeError(f"{reasoning_parser=} has not been registered") from e - self.enable_prompt_tokens_details = enable_prompt_tokens_details self.enable_force_include_usage = enable_force_include_usage self.default_sampling_params = self.model_config.get_diff_sampling_param() From a25f2adee9d66bf16128f2a8f399c558fd181647 Mon Sep 17 00:00:00 2001 From: Angela Yi Date: Sat, 11 Oct 2025 05:44:43 -0700 Subject: [PATCH 11/30] [compile] Add patched_fused_scaled_matmul_reduce_scatter (#26604) Signed-off-by: angelayi --- tests/compile/test_async_tp.py | 26 ++++++-- vllm/compilation/collective_fusion.py | 4 +- vllm/distributed/parallel_state.py | 95 +++++++++++++++++++++++++++ 3 files changed, 119 insertions(+), 6 deletions(-) diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py 
index 88ad4f81df505..d396d3940f67f 100644 --- a/tests/compile/test_async_tp.py +++ b/tests/compile/test_async_tp.py @@ -142,7 +142,7 @@ class TestScaledMMRSModel(_BaseScaledMMModel): return [torch.ops.vllm.reduce_scatter.default] def ops_in_model_after(self): - return [torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter.default] + return [torch.ops.vllm.patched_fused_scaled_matmul_reduce_scatter.default] class TestAGScaledMMModel(_BaseScaledMMModel): @@ -195,7 +195,7 @@ class TestCutlassScaledMMRSModel(_BaseScaledMMModel): return [torch.ops.vllm.reduce_scatter.default] def ops_in_model_after(self): - return [torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter.default] + return [torch.ops.vllm.patched_fused_scaled_matmul_reduce_scatter.default] class TestAGCutlassScaledMMModel(_BaseScaledMMModel): @@ -243,9 +243,15 @@ class TestAGCutlassScaledMMModel(_BaseScaledMMModel): @pytest.mark.parametrize("seq_len", [16]) @pytest.mark.parametrize("hidden_size", [16]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("dynamic", [True, False]) @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA") def test_async_tp_pass_replace( - test_model: str, batch_size: int, seq_len: int, hidden_size: int, dtype: torch.dtype + test_model: str, + batch_size: int, + seq_len: int, + hidden_size: int, + dtype: torch.dtype, + dynamic: bool, ): if ( test_model @@ -269,7 +275,15 @@ def test_async_tp_pass_replace( # torch.distributed and cuda torch.multiprocessing.spawn( fn, - args=(num_processes, test_model, batch_size, seq_len, hidden_size, dtype), + args=( + num_processes, + test_model, + batch_size, + seq_len, + hidden_size, + dtype, + dynamic, + ), nprocs=nprocs, ) @@ -284,6 +298,7 @@ def async_tp_pass_on_test_model( seq_len: int, hidden_size: int, dtype: torch.dtype, + dynamic: bool, ): current_platform.seed_everything(0) @@ -331,6 +346,9 @@ def async_tp_pass_on_test_model( (batch_size * seq_len, hidden_size), 
dtype=dtype, requires_grad=False ) + if dynamic: + torch._dynamo.mark_dynamic(hidden_states, 0) + compiled_model = torch.compile(model, backend=backend) compiled_model(hidden_states) diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py index 970d390f32b45..988a1069cd9e7 100644 --- a/vllm/compilation/collective_fusion.py +++ b/vllm/compilation/collective_fusion.py @@ -172,7 +172,7 @@ class ScaledMMReduceScatterPattern(BasePattern): # Calculate output shape: input @ mat2 with scatter_dim reduced output_shape = [*input.shape[:-1], mat2.shape[1]] scatter_dim = 0 - gemm_rs = torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter( + gemm_rs = torch.ops.vllm.patched_fused_scaled_matmul_reduce_scatter( input, mat2, scale_a, @@ -307,7 +307,7 @@ class CutlassScaledMMReduceScatterPattern(BasePattern): # Calculate output shape: input @ mat2 with scatter_dim reduced output_shape = [*input.shape[:-1], mat2.shape[1]] scatter_dim = 0 - gemm_rs = torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter( + gemm_rs = torch.ops.vllm.patched_fused_scaled_matmul_reduce_scatter( input, mat2, scale_a, diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index aee5507ade467..cb5a75c59f096 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -37,6 +37,8 @@ from unittest.mock import patch import torch import torch.distributed +import torch.distributed._functional_collectives as funcol +import torch.distributed._symmetric_memory from torch.distributed import Backend, ProcessGroup from typing_extensions import deprecated @@ -159,6 +161,90 @@ def all_gather_fake( return torch.empty(new_shape, dtype=tensor.dtype, device=tensor.device) +def patched_fused_scaled_matmul_reduce_scatter_fake( + A: torch.Tensor, + B: torch.Tensor, + A_scale: torch.Tensor, + B_scale: torch.Tensor, + reduce_op: str, + orig_scatter_dim: int, + scatter_dim_after_maybe_reshape: int, + group_name: str, + output_shape: 
list[int], + bias: torch.Tensor | None = None, + result_scale: torch.Tensor | None = None, + out_dtype: torch.dtype | None = None, + use_fast_accum: bool = False, +) -> torch.Tensor: + # Copied from + # https://github.com/pytorch/pytorch/blob/50c338c2da905062449e4d9ac807832d1b5cd90e/torch/distributed/_symmetric_memory/__init__.py#L1189 + if A_scale.numel() > 1: + if A_scale.shape[:-1] != A.shape[:-1]: + raise ValueError( + "For row-wise scaling, the leading dims of A_scale " + "must match the leading dims of A " + f"(A shape: {A.shape}, A_scale shape: {A_scale.shape})" + ) + A_scale = A_scale.flatten(0, -2).contiguous() + elif A_scale.numel() != 1: + raise ValueError( + "Invalid A_scale shape " + f"(A shape: {A.shape}, A_scale shape: {A_scale.shape})" + ) + + C = torch._scaled_mm( + A.flatten(0, -2).contiguous(), + B, + A_scale, + B_scale, + bias, + result_scale, + out_dtype, + use_fast_accum, + ) + C = C.view(*output_shape[:-1], B.shape[1]) + res = funcol.reduce_scatter_tensor( + C, + reduce_op, + orig_scatter_dim, # need original scatter dim for 3D+ output tensor here + group_name, + ) + res = funcol.wait_tensor(res) + return res + + +def patched_fused_scaled_matmul_reduce_scatter( + A: torch.Tensor, + B: torch.Tensor, + A_scale: torch.Tensor, + B_scale: torch.Tensor, + reduce_op: str, + orig_scatter_dim: int, + scatter_dim_after_maybe_reshape: int, + group_name: str, + output_shape: list[int], + bias: torch.Tensor | None = None, + result_scale: torch.Tensor | None = None, + out_dtype: torch.dtype | None = None, + use_fast_accum: bool = False, +) -> torch.Tensor: + return torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter( + A, + B, + A_scale, + B_scale, + reduce_op, + orig_scatter_dim, + scatter_dim_after_maybe_reshape, + group_name, + output_shape, + bias, + result_scale, + out_dtype, + use_fast_accum, + ) + + if supports_custom_op(): direct_register_custom_op( op_name="all_reduce", @@ -178,6 +264,15 @@ if supports_custom_op(): fake_impl=all_gather_fake, ) + 
# TODO: Remove this once the pytorch fix + # (https://github.com/pytorch/pytorch/pull/165086) gets released, + # in either 2.9.1 or 2.10 + direct_register_custom_op( + op_name="patched_fused_scaled_matmul_reduce_scatter", + op_func=patched_fused_scaled_matmul_reduce_scatter, + fake_impl=patched_fused_scaled_matmul_reduce_scatter_fake, + ) + class GroupCoordinator: """ From 9d6cff3edeb2421699671881592fd7558946695e Mon Sep 17 00:00:00 2001 From: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com> Date: Sat, 11 Oct 2025 20:58:33 +0800 Subject: [PATCH 12/30] [Bugfix][Qwen3VL] fix deepstack in qwen3vl (#26626) Signed-off-by: liuye.hj Signed-off-by: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com> Co-authored-by: liuye.hj --- vllm/model_executor/models/qwen3_vl.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 1e6c3485c4d60..6a7d2eaeab3b8 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -1702,12 +1702,6 @@ class Qwen3VLForConditionalGeneration( ) if deepstack_input_embeds is not None: - deepstack_input_embeds = ( - torch.zeros_like(inputs_embeds) - .unsqueeze(0) - .repeat(self.deepstack_num_level, 1, 1) - .contiguous() - ) self._set_deepstack_input_embeds(deepstack_input_embeds) return inputs_embeds From f0a30a067bacb9f3aaec1cc7a7efe005d6b2ff30 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 11 Oct 2025 23:21:33 +0800 Subject: [PATCH 13/30] [Bugfix] Fix qwen-moe packed_modules_mapping (#26634) Signed-off-by: Jee Jee Li --- vllm/model_executor/models/interfaces.py | 2 +- vllm/model_executor/models/qwen2_moe.py | 18 +++++++++++++----- vllm/model_executor/models/qwen3_moe.py | 14 +++++++++----- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 38c9d5abb5877..68915d60ef480 100644 --- 
a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -325,7 +325,7 @@ class SupportsLoRA(Protocol): # are empty by default. embedding_modules: ClassVar[dict[str, str]] = {} embedding_padding_modules: ClassVar[list[str]] = [] - packed_modules_mapping: ClassVar[dict[str, list[str]]] = {} + packed_modules_mapping: dict[str, list[str]] = {} # We can't use runtime_checkable with ClassVar for issubclass checks diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index c57299a2d390f..7251e7b2eea49 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -534,11 +534,7 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): "q_proj", "k_proj", "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], + ] } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -547,6 +543,18 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config + # Only perform the following mapping when Qwen2MoeMLP exists + if ( + getattr(config, "mlp_only_layers", []) + or config.shared_expert_intermediate_size > 0 + ): + self.packed_modules_mapping["gate_up_proj"] = ( + [ + "gate_proj", + "up_proj", + ], + ) + self.model = Qwen2MoeModel( vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") ) diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 825272535a450..0769378933d52 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -634,11 +634,7 @@ class Qwen3MoeForCausalLM( "q_proj", "k_proj", "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], + ] } fall_back_to_pt_during_load = False @@ -649,6 +645,14 @@ class Qwen3MoeForCausalLM( quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config 
+ # Only perform the following mapping when Qwen3MoeMLP exists + if getattr(config, "mlp_only_layers", []): + self.packed_modules_mapping["gate_up_proj"] = ( + [ + "gate_proj", + "up_proj", + ], + ) self.model = Qwen3MoeModel( vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") ) From 5be7ca1b99c91751c515371298d2e9c3ff21941c Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 12 Oct 2025 01:45:32 +0800 Subject: [PATCH 14/30] [Benchmark] Support Infinity API (#26641) Signed-off-by: DarkLight1337 --- vllm/benchmarks/datasets.py | 2 +- vllm/benchmarks/lib/endpoint_request_func.py | 123 ++++++++++++++----- 2 files changed, 96 insertions(+), 29 deletions(-) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 7ffc21905924c..8e71a7bfb1293 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -1584,7 +1584,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: if dataset_class.IS_MULTIMODAL and not ( args.backend in ("openai-chat", "openai-audio") - or "openai-embeddings-" in args.backend + or "embeddings-" in args.backend ): # multi-modal benchmark is only available on OpenAI Chat # endpoint-type. 
diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py index 34dce5edb0c74..28146ce6200d1 100644 --- a/vllm/benchmarks/lib/endpoint_request_func.py +++ b/vllm/benchmarks/lib/endpoint_request_func.py @@ -581,29 +581,6 @@ async def async_request_openai_embeddings_chat( ) -async def async_request_openai_embeddings_clip( - request_func_input: RequestFuncInput, - session: aiohttp.ClientSession, - pbar: Optional[tqdm] = None, -) -> RequestFuncOutput: - if request_func_input.multi_modal_content: - # Image input - request_func_input.prompt = "" - - # max_model_len=77 is too short for most datasets, - # so by default we truncate the prompt to max_model_len - if request_func_input.extra_body is None: - request_func_input.extra_body = {} - if "truncate_prompt_tokens" not in request_func_input.extra_body: - request_func_input.extra_body["truncate_prompt_tokens"] = -1 - - return await async_request_openai_embeddings_chat( - request_func_input, - session, - pbar=pbar, - ) - - def _try_extract_request_idx(request_func_input: RequestFuncInput): if request_func_input.request_id: match = re.search(r"(\d+)$", request_func_input.request_id) @@ -616,11 +593,20 @@ def _try_extract_request_idx(request_func_input: RequestFuncInput): return None -async def async_request_openai_embeddings_vlm2vec( - request_func_input: RequestFuncInput, - session: aiohttp.ClientSession, - pbar: Optional[tqdm] = None, -) -> RequestFuncOutput: +def _preprocess_clip(request_func_input: RequestFuncInput): + if request_func_input.multi_modal_content: + # Image input + request_func_input.prompt = "" + + # max_model_len=77 is too short for most datasets, + # so by default we truncate the prompt to max_model_len + if request_func_input.extra_body is None: + request_func_input.extra_body = {} + if "truncate_prompt_tokens" not in request_func_input.extra_body: + request_func_input.extra_body["truncate_prompt_tokens"] = -1 + + +def 
_preprocess_vlm2vec(request_func_input: RequestFuncInput): if request_func_input.multi_modal_content: request_idx = _try_extract_request_idx(request_func_input) @@ -637,6 +623,28 @@ async def async_request_openai_embeddings_vlm2vec( f"{request_func_input.prompt}" ) + +async def async_request_openai_embeddings_clip( + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + _preprocess_clip(request_func_input) + + return await async_request_openai_embeddings_chat( + request_func_input, + session, + pbar=pbar, + ) + + +async def async_request_openai_embeddings_vlm2vec( + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + _preprocess_vlm2vec(request_func_input) + return await async_request_openai_embeddings_chat( request_func_input, session, @@ -645,6 +653,61 @@ async def async_request_openai_embeddings_vlm2vec( ) +async def async_request_infinity_embeddings( + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + _validate_api_url(api_url, "Infinity Embeddings API", "embeddings") + + payload = { + "model": request_func_input.model_name + if request_func_input.model_name + else request_func_input.model, + } + + if request_func_input.prompt: + payload["input"] = request_func_input.prompt + else: + mm_content = request_func_input.multi_modal_content + assert isinstance(mm_content, dict) + + mm_type = mm_content["type"] + payload["input"] = mm_content[mm_type]["url"] + payload["modality"] = mm_type.split("_", 1)[0] + + _update_payload_common(payload, request_func_input) + + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + _update_headers_common(headers, request_func_input) + + return await _run_openai_embeddings( + session, + api_url, + 
payload=payload, + headers=headers, + pbar=pbar, + ) + + +async def async_request_infinity_embeddings_clip( + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + _preprocess_clip(request_func_input) + + return await async_request_infinity_embeddings( + request_func_input, + session, + pbar=pbar, + ) + + # TODO: Add more request functions for different API protocols. ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = { "vllm": async_request_openai_completions, @@ -655,6 +718,10 @@ ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = { "openai-embeddings-chat": async_request_openai_embeddings_chat, "openai-embeddings-clip": async_request_openai_embeddings_clip, "openai-embeddings-vlm2vec": async_request_openai_embeddings_vlm2vec, + # Infinity embedding server: https://github.com/michaelfeil/infinity + "infinity-embeddings": async_request_infinity_embeddings, + "infinity-embeddings-clip": async_request_infinity_embeddings_clip, + # (Infinity embedding server does not support vlm2vec) } OPENAI_COMPATIBLE_BACKENDS = [ From 0cd103e7cbf0315c69434870c4973ded2c5d99e5 Mon Sep 17 00:00:00 2001 From: Huamin Li <3ericli@gmail.com> Date: Sat, 11 Oct 2025 13:50:57 -0700 Subject: [PATCH 15/30] =?UTF-8?q?CP:=20make=20correct=5Fattn=5Fout=20robus?= =?UTF-8?q?t=20to=204=E2=80=91D=20views=20and=20fix=20Triton=20arg=20bindi?= =?UTF-8?q?ng=20(#26509)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Huamin Li <3ericli@gmail.com> --- vllm/attention/ops/common.py | 52 +++++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 7 deletions(-) diff --git a/vllm/attention/ops/common.py b/vllm/attention/ops/common.py index 097fbae68cda5..1234e1b2e46a8 100644 --- a/vllm/attention/ops/common.py +++ b/vllm/attention/ops/common.py @@ -117,14 +117,52 @@ def correct_attn_out( if ctx is None: ctx = CPTritonContext() - lse = torch.empty_like(lses[0]) + # --- Normalize 
to 3D views --- + if out.ndim == 4 and out.shape[1] == 1: + out = out.squeeze(1) + assert out.ndim == 3, f"expected out [B,H,D] or [B,1,H,D], got {tuple(out.shape)}" - grid = (out.shape[0], out.shape[1], 1) - regular_args = (out, out, lses, lse, *out.stride(), *lses.stride(), cp_rank) - const_args = { - "HEAD_DIM": out.shape[-1], - "N_ROUNDED": lses.shape[0], - } + if lses.ndim == 4 and lses.shape[-1] == 1: + lses = lses.squeeze(-1) + if lses.ndim == 4 and lses.shape[1] == 1: + lses = lses.squeeze(1) + assert lses.ndim == 3, ( + f"expected lses [N,B,H] (optionally with a 1-sized extra dim), " + f"got {tuple(lses.shape)}" + ) + + B, H, D = out.shape + N = lses.shape[0] + + # Strides after we normalized shapes to 3-D views. The kernel computes + # offsets for `vlse_ptr` using lses_stride_B/H, so the output buffer must + # have the same B/H stride layout as a slice of `lses`. + o_sB, o_sH, o_sD = out.stride() + l_sN, l_sB, l_sH = lses.stride() + + # Allocate LSE with the same B/H strides as `lses` so writes land correctly + # even when `lses` is a non-contiguous view (e.g., 4-D to 3-D squeeze). 
+ lse = torch.empty_strided( + (B, H), (l_sB, l_sH), device=lses.device, dtype=lses.dtype + ) + + # Kernel launch config + grid = (B, H, 1) + + regular_args = ( + out, + out, + lses, + lse, + o_sB, + o_sH, + o_sD, + l_sN, + l_sB, + l_sH, + cp_rank, + ) + const_args = {"HEAD_DIM": D, "N_ROUNDED": N} ctx.call_kernel(_correct_attn_cp_out_kernel, grid, *regular_args, **const_args) return out, lse From 01653a917b140535cb6768c995a4bd5ea3253120 Mon Sep 17 00:00:00 2001 From: Angela Yi Date: Sat, 11 Oct 2025 14:03:14 -0700 Subject: [PATCH 16/30] [compile] Fix inductor partition config (#26645) Signed-off-by: angelayi --- vllm/config/compilation.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index e65728ba7f4e1..4209f3a9731c1 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -709,9 +709,7 @@ class CompilationConfig: return self.level == CompilationLevel.PIECEWISE # Inductor partition case - return ( - self.level > CompilationLevel.NO_COMPILATION and self.backend == "inductor" - ) + return self.level > CompilationLevel.NO_COMPILATION and self.use_inductor def custom_op_log_check(self): """ From c5c8f5ea59f060f68b2257bb4d2066a264fc865c Mon Sep 17 00:00:00 2001 From: Haisheng Chen <60504847+HsChen-sys@users.noreply.github.com> Date: Sat, 11 Oct 2025 19:40:47 -0700 Subject: [PATCH 17/30] [EPLB] Support ernie4.5-moe (#22100) Signed-off-by: Haisheng Chen Signed-off-by: Haisheng Chen <60504847+HsChen-sys@users.noreply.github.com> Signed-off-by: Haisheng Chen Co-authored-by: Haisheng Chen --- vllm/model_executor/models/ernie45_moe.py | 139 ++++++++++++++++++++-- 1 file changed, 132 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index 7516cb5abaf9a..f0360d55a2e57 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -33,8 +33,12 @@ from 
transformers import PretrainedConfig from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.distributed import ( + get_ep_group, + get_pp_group, + get_tensor_model_parallel_world_size, +) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import SharedFusedMoE @@ -58,7 +62,7 @@ from vllm.model_executor.model_loader.weight_utils import ( ) from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP +from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP from .utils import ( AutoWeightsLoader, PPMissingLayer, @@ -118,12 +122,34 @@ class Ernie4_5_MoeMoE(nn.Module): config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + enable_eplb: bool = False, ): super().__init__() layer_idx = extract_layer_index(prefix) self.layer_idx = layer_idx self.tp_size = get_tensor_model_parallel_world_size() + + self.moe_num_shared_experts = getattr(config, "moe_num_shared_experts", None) + self.ep_group = get_ep_group().device_group + self.ep_rank = self.ep_group.rank() + self.ep_size = self.ep_group.size() + self.n_routed_experts: int = config.moe_num_experts + self.n_shared_experts: int = self.moe_num_shared_experts + + # Load balancing settings. 
+ vllm_config = get_current_vllm_config() + parallel_config = vllm_config.parallel_config + self.enable_eplb = enable_eplb + + self.n_redundant_experts = parallel_config.num_redundant_experts + self.n_logical_experts = self.n_routed_experts + self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts + self.n_local_physical_experts = self.n_physical_experts // self.ep_size + self.physical_expert_start = self.ep_rank * self.n_local_physical_experts + self.physical_expert_end = ( + self.physical_expert_start + self.n_local_physical_experts + ) self.has_shared_experts = getattr(config, "moe_num_shared_experts", 0) > 0 if self.tp_size > config.moe_num_experts: @@ -171,6 +197,8 @@ class Ernie4_5_MoeMoE(nn.Module): quant_config=quant_config, prefix=f"{prefix}.experts", e_score_correction_bias=self.gate.e_score_correction_bias, + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts, ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -298,6 +326,7 @@ class Ernie4_5_MoeDecoderLayer(nn.Module): cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + enable_eplb: bool = False, ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -338,7 +367,10 @@ class Ernie4_5_MoeDecoderLayer(nn.Module): and layer_idx <= moe_layer_end_index ): self.mlp = Ernie4_5_MoeMoE( - config=config, quant_config=quant_config, prefix=f"{prefix}.mlp" + config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + enable_eplb=enable_eplb, ) else: self.mlp = Ernie4_5_MoeMLP( @@ -393,6 +425,9 @@ class Ernie4_5_MoeModel(nn.Module): self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.config = config + parallel_config = vllm_config.parallel_config + enable_eplb = parallel_config.enable_eplb + self.num_redundant_experts = parallel_config.num_redundant_experts if get_pp_group().is_first_rank: self.embed_tokens = VocabParallelEmbedding( @@ 
-411,6 +446,7 @@ class Ernie4_5_MoeModel(nn.Module): cache_config=cache_config, quant_config=quant_config, prefix=prefix, + enable_eplb=enable_eplb, ), prefix=f"{prefix}.layers", ) @@ -465,6 +501,7 @@ class Ernie4_5_MoeModel(nn.Module): ckpt_down_proj_name="down_proj", ckpt_up_proj_name="up_proj", num_experts=self.config.moe_num_experts, + num_redundant_experts=self.num_redundant_experts, ) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: @@ -513,15 +550,22 @@ class Ernie4_5_MoeModel(nn.Module): weight_loader(param, loaded_weight, shard_id) break else: + is_expert_weight = False for mapping in expert_params_mapping: param_name, weight_name, expert_id, shard_id = mapping if weight_name not in name: continue - name = name.replace(weight_name, param_name) + # Anyway, this is an expert weight and should not be + # attempted to load as other weights later + is_expert_weight = True + + # Do not modify `name` since the loop may continue here + # Instead, create a new variable + name_mapped = name.replace(weight_name, param_name) # Skip layers on other devices. - if is_pp_missing_parameter(name, self): + if is_pp_missing_parameter(name_mapped, self): continue # Skip loading extra bias for GPTQ models. @@ -541,6 +585,12 @@ class Ernie4_5_MoeModel(nn.Module): ) break else: + if is_expert_weight: + # We've checked that this is an expert weight + # However it's not mapped locally to this rank + # So we simply skip it + continue + # Skip loading extra bias for GPTQ models. 
if ( name.endswith(".bias") or name.endswith("_bias") @@ -563,7 +613,7 @@ class Ernie4_5_MoeModel(nn.Module): return loaded_params -class Ernie4_5_MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): +class Ernie4_5_MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA, MixtureOfExperts): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -605,6 +655,81 @@ class Ernie4_5_MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): self.model.make_empty_intermediate_tensors ) + self.expert_weights = [] + + # Set MoE hyperparameters + moe_layers_indices = [ + i + for i in range(config.num_hidden_layers) + if ( + i >= config.moe_layer_start_index + and i <= config.moe_layer_end_index + and (i + 1) % config.moe_layer_interval == 0 + ) + ] + self.num_moe_layers = len(moe_layers_indices) + self.num_expert_groups = 1 + + self.moe_layers: list[SharedFusedMoE] = [] + example_moe = None + for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + + assert isinstance(layer, Ernie4_5_MoeDecoderLayer) + if isinstance(layer.mlp, Ernie4_5_MoeMoE): + example_moe = layer.mlp + self.moe_layers.append(layer.mlp.experts) + + if example_moe is None: + logger.warning("No Ernie4_5_MoeMoE layer found in model.layers.") + self.num_logical_experts = 0 + self.num_physical_experts = 0 + self.num_local_physical_experts = 0 + self.num_routed_experts = 0 + self.num_shared_experts = 0 + self.num_redundant_experts = 0 + else: + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_shared_experts = example_moe.n_shared_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + def set_eplb_state( + self, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + for layer_idx, layer in 
enumerate(self.moe_layers): + # Register the expert weights. + self.expert_weights.append(layer.get_expert_weights()) + layer.set_eplb_state( + moe_layer_idx=layer_idx, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + ) + + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = num_physical_experts - self.num_logical_experts + for layer in self.model.layers: + if isinstance(layer.mlp, Ernie4_5_MoeMoE): + moe = layer.mlp + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) From 4ca204055ef094c933c015847be95fa87ed6443e Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Sun, 12 Oct 2025 14:04:44 +0800 Subject: [PATCH 18/30] Add @noooop to codeowner for pooling models (#26652) Signed-off-by: wang.yuqi --- .github/CODEOWNERS | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index dbcad3aa308f5..61ac9fefc59f4 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -121,3 +121,11 @@ mkdocs.yaml @hmellor # KVConnector installation files /requirements/kv_connectors.txt @NickLucche + +# Pooling models +/examples/*/pooling/ @noooop +/tests/models/*/pooling* @noooop +/tests/entrypoints/pooling @noooop +/vllm/config/pooler.py @noooop +/vllm/pooling_params.py @noooop +/vllm/model_executor/layers/pooler.py @noooop From 82e64c7a204671423c2c7914cbabdf06bc89c0c8 Mon Sep 17 00:00:00 2001 From: Vadim 
Gimpelson <156319763+vadiklyutiy@users.noreply.github.com> Date: Sun, 12 Oct 2025 12:27:50 +0400 Subject: [PATCH 19/30] [PERF] [Qwen3-next] Speed up gated RMSNorm (#26207) Signed-off-by: Vadim Gimpelson Signed-off-by: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- tests/kernels/test_fla_layernorm_guard.py | 388 ++++++++++++++++++ .../layers/fla/ops/layernorm_guard.py | 120 ++++-- 2 files changed, 475 insertions(+), 33 deletions(-) create mode 100644 tests/kernels/test_fla_layernorm_guard.py diff --git a/tests/kernels/test_fla_layernorm_guard.py b/tests/kernels/test_fla_layernorm_guard.py new file mode 100644 index 0000000000000..f944c6dcfa73b --- /dev/null +++ b/tests/kernels/test_fla_layernorm_guard.py @@ -0,0 +1,388 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +import torch +import torch.nn.functional as F + +from vllm.model_executor.layers.fla.ops.layernorm_guard import ( + layer_norm_fwd, + layernorm_fn, + rms_norm_ref, +) +from vllm.platforms import current_platform + + +def layer_norm_ref( + x, + weight, + bias, + z=None, + eps=1e-6, + group_size=None, + norm_before_gate=True, + is_rms_norm=False, +): + """Reference implementation for both layer norm and RMS norm.""" + if is_rms_norm: + # Use the imported rms_norm_ref for RMS norm cases + return rms_norm_ref( + x, + weight, + bias, + z=z, + eps=eps, + group_size=group_size, + norm_before_gate=norm_before_gate, + upcast=True, + ) + + # Layer norm implementation + dtype = x.dtype + x = x.float() + weight = weight.float() + bias = bias.float() if bias is not None else None + z = z.float() if z is not None else None + + if z is not None and not norm_before_gate: + x = x * F.silu(z) + + if group_size is None: + # Layer norm: subtract mean + mean = x.mean(dim=-1, keepdim=True) + var = ((x - 
mean).square()).mean(dim=-1, keepdim=True) + rstd = 1 / torch.sqrt(var + eps) + out = (x - mean) * rstd * weight + if bias is not None: + out = out + bias + else: + # Group norm + from einops import rearrange + + x_group = rearrange(x, "... (g d) -> ... g d", d=group_size) + mean = x_group.mean(dim=-1, keepdim=True) + var = ((x_group - mean).square()).mean(dim=-1, keepdim=True) + rstd = 1 / torch.sqrt(var + eps) + x_group = (x_group - mean) * rstd + out = rearrange(x_group, "... g d -> ... (g d)") * weight + if bias is not None: + out = out + bias + + if z is not None and norm_before_gate: + out *= F.silu(z) + + return out.to(dtype) + + +DTYPES = [torch.bfloat16, torch.float32] +# Test various M sizes to ensure rows_per_block logic works correctly +NUM_TOKENS = [ + 1, + 7, + 16, + 63, + 128, + 256, + 512, + 1024, + 2048, + 4096, + 5789, + 8189, + 8191, + 16383, + 32767, +] +HIDDEN_SIZES = [64, 128, 256, 1024] +GROUP_SIZES = [None, 64, 128] # None means full hidden size +NORM_BEFORE_GATE = [True, False] +IS_RMS_NORM = [True, False] +SEEDS = [0, 42] + + +@pytest.mark.parametrize("num_tokens", NUM_TOKENS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("is_rms_norm", IS_RMS_NORM) +@torch.inference_mode() +def test_layer_norm_fwd_basic( + num_tokens: int, + hidden_size: int, + dtype: torch.dtype, + seed: int, + is_rms_norm: bool, +) -> None: + """Test basic layer norm forward pass without z (gate) tensor.""" + current_platform.seed_everything(seed) + device = torch.device("cuda:0") + + # Create inputs + x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device) + weight = torch.randn(hidden_size, dtype=dtype, device=device) + bias = None if is_rms_norm else torch.randn(hidden_size, dtype=dtype, device=device) + eps = 1e-6 + + # Run the triton kernel + out, mean, rstd = layer_norm_fwd( + x, weight, bias, eps, z=None, is_rms_norm=is_rms_norm + ) 
+ + # Run reference implementation + ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=is_rms_norm) + + # Check outputs + assert out.shape == x.shape + assert out.dtype == x.dtype + torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2) + + # Check mean and rstd shapes + if not is_rms_norm: + assert mean.shape == (num_tokens,) + assert rstd.shape == (num_tokens,) + + +@pytest.mark.parametrize("num_tokens", NUM_TOKENS) +@pytest.mark.parametrize("hidden_size", [128, 256, 1024]) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("norm_before_gate", NORM_BEFORE_GATE) +@pytest.mark.parametrize("is_rms_norm", IS_RMS_NORM) +@torch.inference_mode() +def test_layer_norm_fwd_with_gate( + num_tokens: int, + hidden_size: int, + dtype: torch.dtype, + norm_before_gate: bool, + is_rms_norm: bool, +) -> None: + """Test layer norm forward pass with z (gate) tensor.""" + current_platform.seed_everything(42) + device = torch.device("cuda:0") + + # Create inputs + x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device) + z = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device) + weight = torch.randn(hidden_size, dtype=dtype, device=device) + bias = None if is_rms_norm else torch.randn(hidden_size, dtype=dtype, device=device) + eps = 1e-6 + + # Run the triton kernel + out, mean, rstd = layer_norm_fwd( + x, + weight, + bias, + eps, + z=z, + norm_before_gate=norm_before_gate, + is_rms_norm=is_rms_norm, + ) + + # Run reference implementation + ref_out = layer_norm_ref( + x, + weight, + bias, + z=z, + eps=eps, + norm_before_gate=norm_before_gate, + is_rms_norm=is_rms_norm, + ) + + # Check outputs + assert out.shape == x.shape + assert out.dtype == x.dtype + torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2) + + +@pytest.mark.parametrize("num_tokens", [128, 512]) +@pytest.mark.parametrize("hidden_size", [512, 1024]) +@pytest.mark.parametrize("group_size", [64, 128, 256]) +@pytest.mark.parametrize("dtype", 
[torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("is_rms_norm", IS_RMS_NORM) +@torch.inference_mode() +def test_layer_norm_fwd_with_groups( + num_tokens: int, + hidden_size: int, + group_size: int, + dtype: torch.dtype, + is_rms_norm: bool, +) -> None: + """Test layer norm forward pass with group normalization.""" + if hidden_size % group_size != 0: + pytest.skip( + f"hidden_size {hidden_size} not divisible by group_size {group_size}" + ) + + current_platform.seed_everything(42) + device = torch.device("cuda:0") + + # Create inputs + x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device) + weight = torch.randn(hidden_size, dtype=dtype, device=device) + bias = None if is_rms_norm else torch.randn(hidden_size, dtype=dtype, device=device) + eps = 1e-6 + + ngroups = hidden_size // group_size + + # Run the triton kernel + out, mean, rstd = layer_norm_fwd( + x, weight, bias, eps, z=None, group_size=group_size, is_rms_norm=is_rms_norm + ) + + # Run reference implementation + ref_out = layer_norm_ref( + x, weight, bias, z=None, eps=eps, group_size=group_size, is_rms_norm=is_rms_norm + ) + + # Check outputs + assert out.shape == x.shape + assert out.dtype == x.dtype + torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2) + + # Check mean and rstd shapes for groups + if not is_rms_norm: + assert mean.shape == (ngroups * num_tokens,) + assert rstd.shape == (ngroups * num_tokens,) + + +@pytest.mark.parametrize("num_tokens", [7, 63, 128, 513, 1024, 2049]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@torch.inference_mode() +def test_layer_norm_rows_per_block( + num_tokens: int, + dtype: torch.dtype, +) -> None: + """Test that rows_per_block logic works correctly for various M sizes.""" + current_platform.seed_everything(42) + device = torch.device("cuda:0") + hidden_size = 1024 + + # Create inputs + x = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device) + weight = torch.randn(hidden_size, dtype=dtype, 
device=device) + bias = torch.randn(hidden_size, dtype=dtype, device=device) + eps = 1e-6 + + # Run the triton kernel + out, mean, rstd = layer_norm_fwd(x, weight, bias, eps, z=None, is_rms_norm=False) + + # Run reference implementation + ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=False) + + # Check outputs + torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2) + + +@pytest.mark.parametrize("dtype", [torch.bfloat16]) +@torch.inference_mode() +def test_strided_input(dtype: torch.dtype) -> None: + """Test that the kernel handles non-contiguous (strided) + inputs correctly.""" + current_platform.seed_everything(42) + device = torch.device("cuda:0") + num_tokens = 128 + hidden_size = 1024 + + # Create a larger tensor and take a strided slice + x_large = torch.randn(num_tokens, hidden_size * 2, dtype=dtype, device=device) + x = x_large[:, :hidden_size] + + # Make it contiguous for the kernel + x_contiguous = x.contiguous() + + weight = torch.randn(hidden_size, dtype=dtype, device=device) + bias = torch.randn(hidden_size, dtype=dtype, device=device) + eps = 1e-6 + + # Run the triton kernel with contiguous input + out, mean, rstd = layer_norm_fwd( + x_contiguous, weight, bias, eps, z=None, is_rms_norm=False + ) + + # Run reference implementation + ref_out = layer_norm_ref( + x_contiguous, weight, bias, z=None, eps=eps, is_rms_norm=False + ) + + # Check outputs + torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2) + + +@pytest.mark.parametrize("num_tokens", [1, 128, 2048]) +@pytest.mark.parametrize("hidden_size", [768, 4096]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@torch.inference_mode() +def test_output_buffer_provided( + num_tokens: int, + hidden_size: int, + dtype: torch.dtype, +) -> None: + """Test that the kernel works when an output buffer is provided.""" + current_platform.seed_everything(42) + device = torch.device("cuda:0") + + # Create inputs + x = torch.randn(num_tokens, hidden_size, 
dtype=dtype, device=device) + weight = torch.randn(hidden_size, dtype=dtype, device=device) + bias = torch.randn(hidden_size, dtype=dtype, device=device) + eps = 1e-6 + + # Pre-allocate output buffer + out_buffer = torch.empty_like(x) + + # Run the triton kernel with provided output + out, mean, rstd = layer_norm_fwd( + x, weight, bias, eps, z=None, out=out_buffer, is_rms_norm=False + ) + + # Check that the provided buffer was used + assert out.data_ptr() == out_buffer.data_ptr() + + # Run reference implementation + ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=False) + + # Check outputs + torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2) + + +@pytest.mark.parametrize( + "shape", + [ + (4, 16, 1024), # 3D tensor + (2, 8, 512, 256), # 4D tensor + ], +) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@torch.inference_mode() +def test_multidimensional_input( + shape: tuple, + dtype: torch.dtype, +) -> None: + """Test that the autograd function handles multidimensional inputs.""" + current_platform.seed_everything(42) + device = torch.device("cuda:0") + hidden_size = shape[-1] + + # Create inputs + x = torch.randn(*shape, dtype=dtype, device=device) + weight = torch.randn(hidden_size, dtype=dtype, device=device) + bias = torch.randn(hidden_size, dtype=dtype, device=device) + eps = 1e-6 + + # Run through autograd function + out = layernorm_fn(x, weight, bias, z=None, eps=eps) + + # Run reference implementation + ref_out = layer_norm_ref(x, weight, bias, z=None, eps=eps, is_rms_norm=False) + + # Check outputs + assert out.shape == x.shape + torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2) + + +if __name__ == "__main__": + # Run a quick smoke test + test_layer_norm_fwd_basic(128, 1024, torch.float16, 42, False) + test_layer_norm_fwd_with_gate(128, 1024, torch.float16, True, False) + test_layer_norm_rows_per_block(513, torch.float16) + print("All smoke tests passed!") diff --git 
a/vllm/model_executor/layers/fla/ops/layernorm_guard.py b/vllm/model_executor/layers/fla/ops/layernorm_guard.py index 655cdb3f30eb1..6d039efe58767 100644 --- a/vllm/model_executor/layers/fla/ops/layernorm_guard.py +++ b/vllm/model_executor/layers/fla/ops/layernorm_guard.py @@ -13,6 +13,7 @@ # This backward pass is faster for dimensions up to 8k, but after that it's much slower due to register spilling. # The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine. +from functools import lru_cache from typing import Optional import torch @@ -21,6 +22,7 @@ import torch.nn.functional as F from einops import rearrange from vllm.triton_utils import tl, triton +from vllm.utils import cdiv, next_power_of_2 from .utils import input_guard @@ -76,55 +78,103 @@ def layer_norm_fwd_kernel( stride_y_row, stride_z_row, M, # number of rows in X - N, # number of columns in X + N: tl.constexpr, # number of columns in X eps, # epsilon to avoid division by zero BLOCK_N: tl.constexpr, + ROWS_PER_BLOCK: tl.constexpr, HAS_BIAS: tl.constexpr, HAS_Z: tl.constexpr, NORM_BEFORE_GATE: tl.constexpr, IS_RMS_NORM: tl.constexpr, ): - # Map the program id to the row of X and Y it should compute. - row = tl.program_id(0) + # Map the program id to the starting row of X and Y it should compute. 
+ row_start = tl.program_id(0) * ROWS_PER_BLOCK group = tl.program_id(1) - X += row * stride_x_row + group * N - Y += row * stride_y_row + group * N - if HAS_Z: - Z += row * stride_z_row + group * N - if not IS_RMS_NORM: - Mean += group * M - Rstd += group * M - W += group * N - if HAS_BIAS: - B += group * N - # Compute mean and variance + + # Create 2D tile: [ROWS_PER_BLOCK, BLOCK_N] + rows = row_start + tl.arange(0, ROWS_PER_BLOCK) cols = tl.arange(0, BLOCK_N) - x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32) + + # Compute offsets for 2D tile + row_offsets = rows[:, None] * stride_x_row + col_offsets = cols[None, :] + group * N + + # Base pointers + X_base = X + row_offsets + col_offsets + Y_base = Y + rows[:, None] * stride_y_row + col_offsets + + # Create mask for valid rows and columns + row_mask = rows[:, None] < M + col_mask = cols[None, :] < N + mask = row_mask & col_mask + + # Load input data with 2D tile + x = tl.load(X_base, mask=mask, other=0.0).to(tl.float32) + if HAS_Z and not NORM_BEFORE_GATE: - z = tl.load(Z + cols, mask=cols < N).to(tl.float32) + Z_base = Z + rows[:, None] * stride_z_row + col_offsets + z = tl.load(Z_base, mask=mask, other=0.0).to(tl.float32) x *= z * tl.sigmoid(z) + + # Compute mean and variance per row (reduce along axis 1) if not IS_RMS_NORM: - mean = tl.sum(x, axis=0) / N - tl.store(Mean + row, mean) - xbar = tl.where(cols < N, x - mean, 0.0) - var = tl.sum(xbar * xbar, axis=0) / N + mean = tl.sum(x, axis=1) / N # Shape: [ROWS_PER_BLOCK] + # Store mean for each row + mean_offsets = group * M + rows + mean_mask = rows < M + tl.store(Mean + mean_offsets, mean, mask=mean_mask) + # Broadcast mean back to 2D for subtraction + xbar = tl.where(mask, x - mean[:, None], 0.0) + var = tl.sum(xbar * xbar, axis=1) / N # Shape: [ROWS_PER_BLOCK] else: - xbar = tl.where(cols < N, x, 0.0) - var = tl.sum(xbar * xbar, axis=0) / N - rstd = 1 / tl.sqrt(var + eps) - tl.store(Rstd + row, rstd) - # Normalize and apply linear 
transformation - mask = cols < N - w = tl.load(W + cols, mask=mask).to(tl.float32) + xbar = tl.where(mask, x, 0.0) + var = tl.sum(xbar * xbar, axis=1) / N # Shape: [ROWS_PER_BLOCK] + mean = 0.0 # Placeholder for RMS norm + + rstd = tl.rsqrt(var + eps) # Shape: [ROWS_PER_BLOCK] + + # Store rstd for each row + rstd_offsets = group * M + rows + rstd_mask = rows < M + tl.store(Rstd + rstd_offsets, rstd, mask=rstd_mask) + + # Load weights and biases (broadcast across rows) + w_offsets = cols + group * N + w_mask = cols < N + w = tl.load(W + w_offsets, mask=w_mask, other=0.0).to(tl.float32) + if HAS_BIAS: - b = tl.load(B + cols, mask=mask).to(tl.float32) - x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd - y = x_hat * w + b if HAS_BIAS else x_hat * w + b = tl.load(B + w_offsets, mask=w_mask, other=0.0).to(tl.float32) + + # Normalize and apply linear transformation + if not IS_RMS_NORM: + x_hat = (x - mean[:, None]) * rstd[:, None] + else: + x_hat = x * rstd[:, None] + + y = x_hat * w[None, :] + b[None, :] if HAS_BIAS else x_hat * w[None, :] + if HAS_Z and NORM_BEFORE_GATE: - z = tl.load(Z + cols, mask=mask).to(tl.float32) + Z_base = Z + rows[:, None] * stride_z_row + col_offsets + z = tl.load(Z_base, mask=mask, other=0.0).to(tl.float32) y *= z * tl.sigmoid(z) + # Write output - tl.store(Y + cols, y, mask=mask) + tl.store(Y_base, y, mask=mask) + + +@lru_cache +def _get_sm_count(device: torch.device) -> int: + """Get and cache the SM count for a given device.""" + props = torch.cuda.get_device_properties(device) + return props.multi_processor_count + + +def calc_rows_per_block(M: int, device: torch.device) -> int: + sm_count = _get_sm_count(device) + rows_per_block = next_power_of_2(cdiv(M, 2 * sm_count)) + rows_per_block = min(rows_per_block, 4) + return rows_per_block def layer_norm_fwd( @@ -171,7 +221,10 @@ def layer_norm_fwd( raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.") # heuristics for number of warps num_warps = min(max(BLOCK_N 
// 256, 1), 8) - grid = (M, ngroups) + # Calculate rows per block based on SM count + rows_per_block = calc_rows_per_block(M, x.device) + # Update grid to use rows_per_block + grid = (cdiv(M, rows_per_block), ngroups) layer_norm_fwd_kernel[grid]( x, out, @@ -187,6 +240,7 @@ def layer_norm_fwd( group_size, eps, BLOCK_N=BLOCK_N, + ROWS_PER_BLOCK=rows_per_block, NORM_BEFORE_GATE=norm_before_gate, IS_RMS_NORM=is_rms_norm, num_warps=num_warps, From 76852017ea9175e1301b34162fa952bf82f02e94 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Sun, 12 Oct 2025 17:29:08 +0800 Subject: [PATCH 20/30] [MISC] Rename the torch profiler filename as instance_id+rank_id for merging the Profiler results of each Rank (#25867) Signed-off-by: wang.yuqi --- vllm/config/vllm.py | 4 ++++ vllm/v1/worker/gpu_worker.py | 3 ++- vllm/v1/worker/xpu_worker.py | 3 ++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 833581035a318..e6cfcad3d6962 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -5,6 +5,7 @@ import copy import hashlib import json import os +import time from contextlib import contextmanager from dataclasses import field, replace from functools import lru_cache @@ -270,6 +271,9 @@ class VllmConfig: def __post_init__(self): """Verify configs are valid & consistent with each other.""" + # To give each torch profile run a unique instance name. + self.instance_id = f"{time.time_ns()}" + self.try_verify_and_update_config() if self.model_config is not None: diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 4f4da73fba6e6..119e474b1fca9 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -79,6 +79,7 @@ class Worker(WorkerBase): # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace if envs.VLLM_TORCH_PROFILER_DIR: torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR + worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" logger.info( "Profiling enabled. 
Traces will be saved to: %s", torch_profiler_trace_dir, @@ -101,7 +102,7 @@ class Worker(WorkerBase): with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, on_trace_ready=torch.profiler.tensorboard_trace_handler( - torch_profiler_trace_dir, use_gzip=True + torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True ), ) else: diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py index a1e54628d9ed1..31fa3f3bd6acc 100644 --- a/vllm/v1/worker/xpu_worker.py +++ b/vllm/v1/worker/xpu_worker.py @@ -39,6 +39,7 @@ class XPUWorker(Worker): # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace if envs.VLLM_TORCH_PROFILER_DIR: torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR + worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" logger.info( "Profiling enabled. Traces will be saved to: %s", torch_profiler_trace_dir, @@ -61,7 +62,7 @@ class XPUWorker(Worker): with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, on_trace_ready=torch.profiler.tensorboard_trace_handler( - torch_profiler_trace_dir, use_gzip=True + torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True ), ) else: From 045b396d090f4a16fbba760bef86e9a24a7ba9ce Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 12 Oct 2025 17:42:42 +0800 Subject: [PATCH 21/30] [Bugfix][CI/Build] Fix failing Mteb CI (#26638) Signed-off-by: Isotr0py --- tests/models/language/pooling_mteb_test/mteb_utils.py | 2 +- tests/models/language/pooling_mteb_test/test_jina.py | 5 +++++ .../models/language/pooling_mteb_test/test_st_projector.py | 1 + tests/models/utils.py | 1 + vllm/model_executor/layers/layernorm.py | 6 +++++- 5 files changed, 13 insertions(+), 2 deletions(-) diff --git a/tests/models/language/pooling_mteb_test/mteb_utils.py b/tests/models/language/pooling_mteb_test/mteb_utils.py index d96dc90416855..65ad49fad3653 100644 --- a/tests/models/language/pooling_mteb_test/mteb_utils.py +++ 
b/tests/models/language/pooling_mteb_test/mteb_utils.py @@ -191,7 +191,7 @@ def mteb_test_embed_models( with vllm_runner( model_info.name, runner="pooling", - max_model_len=None, + max_model_len=model_info.max_model_len, **vllm_extra_kwargs, ) as vllm_model: model_config = vllm_model.llm.llm_engine.model_config diff --git a/tests/models/language/pooling_mteb_test/test_jina.py b/tests/models/language/pooling_mteb_test/test_jina.py index 0a712b2542f3c..dbdf82af33c72 100644 --- a/tests/models/language/pooling_mteb_test/test_jina.py +++ b/tests/models/language/pooling_mteb_test/test_jina.py @@ -25,6 +25,11 @@ EMBEDDING_MODELS = [ mteb_score=0.824413164, architecture="XLMRobertaModel", is_matryoshka=True, + # The default max length of the model is 8194, which will crash + # CUDAGraph due to odd length for Gemm. We set it to 8192 to avoid + # avoid this issue. + max_model_len=8192, + dtype="float32", ) ] diff --git a/tests/models/language/pooling_mteb_test/test_st_projector.py b/tests/models/language/pooling_mteb_test/test_st_projector.py index 91b1ef828d0df..74fe4b9bcc03f 100644 --- a/tests/models/language/pooling_mteb_test/test_st_projector.py +++ b/tests/models/language/pooling_mteb_test/test_st_projector.py @@ -23,6 +23,7 @@ ST_PROJECTOR_MODELS = [ architecture="Gemma3TextModel", mteb_score=0.7473819294684156, enable_test=True, + dtype="float32", ), ] diff --git a/tests/models/utils.py b/tests/models/utils.py index 84697ad68d441..3d6e6cb89d62a 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -369,6 +369,7 @@ class ModelInfo: name: str architecture: str = "" dtype: str = "auto" + max_model_len: Optional[int] = None hf_dtype: str = "float32" hf_overrides: Optional[dict[str, Any]] = None default_pooling_type: str = "" diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 6a49ae42ca895..910f145b1f8c2 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -318,7 
+318,11 @@ class GemmaRMSNorm(CustomOp): """PyTorch-native implementation equivalent to forward().""" orig_dtype = x.dtype if residual is not None: - x = x + residual.float() if orig_dtype == torch.float16 else x + residual + x = ( + x.float() + residual.float() + if orig_dtype == torch.float16 + else x + residual + ) residual = x x = x.float() From b91d8db873a5f4d639a5cb57288cd94ed1614bb0 Mon Sep 17 00:00:00 2001 From: Jaya Yuan Date: Sun, 12 Oct 2025 17:58:38 +0800 Subject: [PATCH 22/30] [Bugfix][DCP] Set default CUDAGraphMode to PIECEWISE for DCP (#26574) Signed-off-by: FENP <32334296+FENP@users.noreply.github.com> --- vllm/config/vllm.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index e6cfcad3d6962..9d156dd8d9de3 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -350,6 +350,15 @@ class VllmConfig: or self.model_config.is_encoder_decoder ): self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE + + # decode context parallel do not support full cudagraphs now. + if self.parallel_config.decode_context_parallel_size > 1: + logger.warning( + "Decode context parallel (DCP) is enabled, which is " + "incompatible with full CUDA graphs. Set " + "cudagraph_mode to PIECEWISE." 
+ ) + self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE else: self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE From 9bb38130cb19eb084d39f269cbeae2952789fafd Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Sun, 12 Oct 2025 06:39:05 -0500 Subject: [PATCH 23/30] [Bugfix] Fix GPU_ID issue in test script (#26442) Signed-off-by: Chendi Xue --- .../nixl_integration/run_accuracy_test.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh index 3b0f2d102c1ff..3bf722900df37 100755 --- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh @@ -101,6 +101,12 @@ run_tests_for_model() { for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do # Calculate GPU ID - we'll distribute across available GPUs GPU_ID=$((i % $(get_num_gpus))) + NEXT_GPU=${GPU_ID} + # If PREFILLER_TP_SIZE is more than 1 + for (( j=1; j < PREFILLER_TP_SIZE; j++ )); do + NEXT_GPU=$(((GPU_ID + j) % $(get_num_gpus))) + GPU_ID="${GPU_ID},${NEXT_GPU}" + done # Calculate port number (base port + instance number) PORT=$((8100 + i)) @@ -136,7 +142,12 @@ run_tests_for_model() { # Start decode instances for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do # Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs - GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus))) + GPU_ID=$(((i + NEXT_GPU + 1) % $(get_num_gpus))) + # If DECODER_TP_SIZE is more than 1 + for (( j=1; j < DECODER_TP_SIZE; j++ )); do + NEXT_GPU=$(((GPU_ID + j) % $(get_num_gpus))) + GPU_ID="${GPU_ID},${NEXT_GPU}" + done # Calculate port number (base port + instance number) PORT=$((8200 + i)) # Calculate side channel port From 8fcaaf6a165e661f63fc51be906bc05b0767332f Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sun, 12 Oct 2025 
17:51:31 +0100 Subject: [PATCH 24/30] Update `Optional[x]` -> `x | None` and `Union[x, y]` to `x | y` (#26633) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- benchmarks/backend_request_func.py | 27 +- benchmarks/benchmark_prefix_caching.py | 5 +- benchmarks/benchmark_prioritization.py | 3 +- .../benchmark_serving_structured_output.py | 7 +- benchmarks/benchmark_utils.py | 16 +- .../cutlass_benchmarks/sparse_benchmarks.py | 3 +- .../cutlass_benchmarks/w8a8_benchmarks.py | 11 +- .../fused_kernels/layernorm_rms_benchmarks.py | 9 +- .../kernels/bench_per_token_quant_fp8.py | 2 +- .../kernels/benchmark_device_communicators.py | 6 +- benchmarks/kernels/benchmark_lora.py | 21 +- benchmarks/kernels/benchmark_machete.py | 31 +- .../kernels/benchmark_paged_attention.py | 3 +- .../benchmark_per_token_group_quant.py | 2 +- .../kernels/benchmark_reshape_and_cache.py | 2 - .../benchmark_reshape_and_cache_flash.py | 2 - benchmarks/kernels/benchmark_rmsnorm.py | 11 +- benchmarks/kernels/benchmark_rope.py | 3 +- .../benchmark_trtllm_decode_attention.py | 5 +- .../benchmark_trtllm_prefill_attention.py | 5 +- benchmarks/kernels/utils.py | 6 +- benchmarks/multi_turn/bench_dataset.py | 22 +- .../benchmark_serving_multi_turn.py | 10 +- .../multi_turn/convert_sharegpt_to_openai.py | 18 +- .../vllm_cutlass_library_extension.py | 15 +- csrc/quantization/machete/generate.py | 5 +- docs/contributing/model/transcription.md | 14 +- docs/design/logits_processors.md | 6 +- docs/features/custom_logitsprocs.md | 3 +- examples/offline_inference/audio_language.py | 12 +- .../rogue_shared_storage_connector.py | 4 +- .../logits_processor/custom.py | 4 +- .../logits_processor/custom_req.py | 6 +- .../logits_processor/custom_req_init.py | 4 +- .../lora_with_quantization_inference.py | 7 +- .../offline_inference/multilora_inference.py | 6 +- .../prithvi_geospatial_mae.py | 3 +- examples/offline_inference/rlhf_utils.py | 7 +- examples/offline_inference/vision_language.py | 6 
+- .../vision_language_multi_image.py | 12 +- .../vision_language_pooling.py | 16 +- .../disagg_proxy_demo.py | 17 +- .../online_serving/kv_events_subscriber.py | 12 +- .../multi_instance_data_parallel.py | 3 +- .../pooling/cohere_rerank_client.py | 4 +- ...ai_chat_embedding_client_for_multimodal.py | 4 +- .../structured_outputs/structured_outputs.py | 10 +- pyproject.toml | 6 - tests/benchmarks/test_random_dataset.py | 6 +- tests/ci_envs.py | 9 +- tests/compile/backend.py | 5 +- tests/compile/piecewise/test_toy_llama.py | 6 +- tests/compile/test_basic_correctness.py | 2 - tests/compile/test_full_graph.py | 6 +- tests/compile/test_fusion_attn.py | 5 +- tests/compile/test_wrapper.py | 7 +- tests/conftest.py | 148 ++-- tests/detokenizer/test_stop_strings.py | 6 +- tests/distributed/conftest.py | 7 +- tests/distributed/test_comm_ops.py | 5 +- tests/distributed/test_context_parallel.py | 6 +- tests/distributed/test_expert_parallel.py | 20 +- tests/distributed/test_pipeline_parallel.py | 8 +- tests/distributed/test_pp_cudagraph.py | 8 +- tests/distributed/test_sequence_parallel.py | 10 +- tests/engine/test_arg_utils.py | 16 +- .../openai/test_async_tokenization.py | 2 +- tests/entrypoints/openai/test_chat.py | 3 +- .../test_completion_with_function_calling.py | 3 +- .../entrypoints/openai/test_lora_resolvers.py | 11 +- tests/entrypoints/openai/test_serving_chat.py | 9 +- .../entrypoints/openai/tool_parsers/utils.py | 9 +- .../openai/test_embedding_dimensions.py | 6 +- .../test_api_server_process_manager.py | 5 +- tests/entrypoints/test_chat_utils.py | 14 +- tests/entrypoints/test_renderer.py | 3 +- tests/evals/gsm8k/gsm8k_eval.py | 13 +- .../attention/test_aiter_flash_attn.py | 11 +- tests/kernels/attention/test_attention.py | 7 +- .../attention/test_cascade_flash_attn.py | 3 +- .../attention/test_cutlass_mla_decode.py | 3 +- tests/kernels/attention/test_flash_attn.py | 17 +- tests/kernels/attention/test_flashinfer.py | 17 +- .../test_flashinfer_trtllm_attention.py | 13 
+- .../attention/test_merge_attn_states.py | 3 +- .../test_triton_unified_attention.py | 11 +- .../core/test_fused_quant_layernorm.py | 31 +- tests/kernels/core/test_pos_encoding.py | 4 +- tests/kernels/core/test_rotary_embedding.py | 4 +- tests/kernels/mamba/test_causal_conv1d.py | 21 +- .../moe/modular_kernel_tools/common.py | 34 +- .../make_feature_matrix.py | 5 +- .../moe/modular_kernel_tools/mk_objects.py | 23 +- .../modular_kernel_tools/parallel_utils.py | 11 +- .../profile_modular_kernel.py | 3 +- tests/kernels/moe/parallel_utils.py | 21 +- tests/kernels/moe/test_batched_moe.py | 7 +- .../moe/test_count_expert_num_tokens.py | 3 +- tests/kernels/moe/test_cutlass_moe.py | 23 +- tests/kernels/moe/test_deepep_deepgemm_moe.py | 13 +- tests/kernels/moe/test_deepep_moe.py | 39 +- .../moe/test_modular_kernel_combinations.py | 10 +- tests/kernels/moe/test_moe.py | 8 +- .../kernels/moe/test_moe_align_block_size.py | 4 +- .../kernels/moe/test_moe_permute_unpermute.py | 8 +- tests/kernels/moe/test_ocp_mx_moe.py | 13 +- tests/kernels/moe/test_pplx_cutlass_moe.py | 3 +- tests/kernels/moe/test_pplx_moe.py | 64 +- tests/kernels/moe/utils.py | 89 ++- tests/kernels/quant_utils.py | 29 +- .../kernels/quantization/test_cutlass_w4a8.py | 21 +- tests/kernels/quantization/test_machete_mm.py | 41 +- .../quantization/test_triton_scaled_mm.py | 3 +- tests/kernels/test_onednn.py | 6 +- tests/kernels/utils.py | 110 ++- tests/lora/test_layers.py | 7 +- tests/lora/test_llama_tp.py | 7 +- tests/lora/test_qwen2vl.py | 7 +- tests/lora/test_resolver.py | 3 +- tests/lora/test_utils.py | 4 +- tests/lora/utils.py | 5 +- .../tensorizer_loader/conftest.py | 2 +- .../model_executor/test_enabled_custom_ops.py | 3 +- .../models/language/generation/test_common.py | 3 +- .../models/language/generation/test_hybrid.py | 2 +- .../language/generation_ppl_test/ppl_utils.py | 4 +- tests/models/language/pooling/embed_utils.py | 3 +- .../models/language/pooling/test_embedding.py | 3 +- 
tests/models/language/pooling/test_gritlm.py | 2 - .../language/pooling_mteb_test/mteb_utils.py | 7 +- .../test_bge_reranker_v2_gemma.py | 4 +- .../generation/test_granite_speech.py | 7 +- .../generation/test_phi4_multimodal.py | 5 +- .../multimodal/generation/test_phi4mm.py | 7 +- .../multimodal/generation/test_pixtral.py | 4 +- .../multimodal/generation/test_qwen2_vl.py | 6 +- .../multimodal/generation/test_whisper.py | 3 +- .../generation/vlm_utils/builders.py | 17 +- .../multimodal/generation/vlm_utils/core.py | 21 +- .../generation/vlm_utils/custom_inputs.py | 2 +- .../generation/vlm_utils/model_utils.py | 21 +- .../multimodal/generation/vlm_utils/types.py | 60 +- .../multimodal/pooling/test_dse_qwen2_vl.py | 2 +- .../pooling/test_jinavl_reranker.py | 5 +- .../multimodal/processing/test_common.py | 5 +- .../multimodal/processing/test_h2ovl.py | 3 +- .../multimodal/processing/test_internvl.py | 3 +- .../multimodal/processing/test_nemotron_vl.py | 3 +- .../processing/test_tensor_schema.py | 16 +- tests/models/quantization/test_awq.py | 3 +- tests/models/registry.py | 20 +- tests/models/test_transformers.py | 8 +- tests/models/utils.py | 38 +- tests/multimodal/test_cache.py | 7 +- tests/multimodal/test_processing.py | 4 +- .../prithvi_io_processor/prithvi_processor.py | 10 +- .../prithvi_io_processor/types.py | 6 +- .../my_gemma_embedding.py | 7 +- .../vllm_add_dummy_model/my_llava.py | 3 +- .../vllm_add_dummy_model/my_opt.py | 3 +- .../vllm_add_dummy_platform/__init__.py | 4 +- tests/quantization/test_blackwell_moe.py | 3 +- tests/quantization/test_compressed_tensors.py | 4 +- tests/quantization/test_quark.py | 5 +- .../test_register_quantization_config.py | 16 +- tests/reasoning/utils.py | 17 +- tests/samplers/test_no_bad_words.py | 8 +- tests/tokenization/test_detokenize.py | 4 +- tests/tokenization/test_tokenizer_registry.py | 18 +- tests/tool_use/mistral/utils.py | 7 +- tests/tool_use/test_jamba_tool_parser.py | 3 +- 
tests/tool_use/test_parallel_tool_calls.py | 3 +- tests/tool_use/test_qwen3coder_tool_parser.py | 3 +- tests/tool_use/test_seed_oss_tool_parser.py | 3 +- tests/tool_use/test_tool_calls.py | 7 +- tests/tool_use/test_xlam_tool_parser.py | 3 +- tests/tool_use/utils.py | 10 +- .../test_config_parser_registry.py | 7 +- tests/utils.py | 47 +- tests/v1/attention/test_attention_backends.py | 5 +- tests/v1/attention/test_mla_backends.py | 6 +- tests/v1/attention/utils.py | 7 +- tests/v1/core/test_kv_cache_utils.py | 8 +- tests/v1/core/test_prefix_caching.py | 10 +- tests/v1/core/test_scheduler.py | 23 +- tests/v1/core/utils.py | 17 +- tests/v1/distributed/test_async_llm_dp.py | 11 +- tests/v1/distributed/test_internal_lb_dp.py | 6 +- tests/v1/e2e/test_min_tokens.py | 8 +- tests/v1/e2e/test_spec_decode.py | 6 +- tests/v1/engine/test_async_llm.py | 9 +- tests/v1/engine/test_engine_core_client.py | 12 +- tests/v1/engine/test_llm_engine.py | 4 +- tests/v1/engine/test_output_processor.py | 17 +- tests/v1/engine/utils.py | 15 +- .../llm/test_struct_output_generate.py | 4 +- .../v1/entrypoints/openai/test_completion.py | 3 +- tests/v1/executor/test_executor.py | 11 +- .../unit/test_kv_load_failure_recovery.py | 2 +- .../kv_connector/unit/test_nixl_connector.py | 5 +- .../unit/test_output_aggreagator.py | 7 +- tests/v1/kv_connector/unit/utils.py | 13 +- tests/v1/kv_offload/test_cpu_manager.py | 3 +- .../v1/logits_processors/test_correctness.py | 8 +- .../logits_processors/test_custom_offline.py | 4 +- .../logits_processors/test_custom_online.py | 6 +- tests/v1/logits_processors/utils.py | 8 +- tests/v1/sample/test_rejection_sampler.py | 28 +- tests/v1/sample/utils.py | 8 +- tests/v1/spec_decode/test_eagle.py | 3 +- tests/v1/spec_decode/test_tree_attention.py | 3 +- tests/v1/test_serial_utils.py | 3 +- tests/v1/tpu/test_basic.py | 4 +- tests/v1/tpu/test_perf.py | 4 +- tests/v1/tracing/test_tracing.py | 2 - tests/v1/worker/test_gpu_input_batch.py | 3 +- 
.../v1/worker/test_worker_memory_snapshot.py | 5 +- .../vllm_test_utils/vllm_test_utils/blame.py | 3 +- .../vllm_test_utils/monitor.py | 4 +- tools/check_init_lazy_imports.py | 2 - tools/enforce_regex_import.py | 2 - tools/pre_commit/mypy.py | 5 +- tools/profiler/visualize_layerwise_profile.py | 4 +- vllm/_bc_linter.py | 5 +- vllm/_custom_ops.py | 208 +++--- vllm/_ipex_ops.py | 39 +- vllm/assets/base.py | 3 +- vllm/assets/video.py | 4 +- vllm/attention/backends/abstract.py | 40 +- vllm/attention/backends/registry.py | 5 +- vllm/attention/backends/utils.py | 3 +- vllm/attention/layer.py | 53 +- .../layers/chunked_local_attention.py | 12 +- vllm/attention/layers/cross_attention.py | 5 +- .../layers/encoder_only_attention.py | 5 +- vllm/attention/ops/flashmla.py | 19 +- vllm/attention/ops/merge_attn_states.py | 3 +- vllm/attention/ops/paged_attn.py | 11 +- vllm/attention/ops/rocm_aiter_mla.py | 19 +- vllm/attention/ops/rocm_aiter_paged_attn.py | 3 +- .../attention/ops/triton_merge_attn_states.py | 3 +- vllm/attention/selector.py | 19 +- vllm/attention/utils/fa_utils.py | 3 +- vllm/beam_search.py | 16 +- vllm/benchmarks/datasets.py | 66 +- vllm/benchmarks/latency.py | 4 +- vllm/benchmarks/lib/endpoint_request_func.py | 40 +- vllm/benchmarks/serve.py | 30 +- vllm/benchmarks/throughput.py | 14 +- vllm/compilation/backends.py | 17 +- vllm/compilation/base_static_graph.py | 3 +- vllm/compilation/collective_fusion.py | 19 +- vllm/compilation/compiler_interface.py | 33 +- vllm/compilation/cuda_graph.py | 11 +- vllm/compilation/decorators.py | 23 +- vllm/compilation/fix_functionalization.py | 11 +- vllm/compilation/fx_utils.py | 9 +- vllm/compilation/inductor_pass.py | 15 +- vllm/compilation/noop_elimination.py | 7 +- vllm/compilation/partition_rules.py | 6 +- vllm/compilation/piecewise_backend.py | 3 +- vllm/compilation/sequence_parallelism.py | 7 +- vllm/compilation/torch25_custom_graph_pass.py | 4 +- vllm/compilation/vllm_inductor_pass.py | 4 +- vllm/compilation/wrapper.py 
| 4 +- vllm/config/cache.py | 20 +- vllm/config/compilation.py | 19 +- vllm/config/device.py | 4 +- vllm/config/kv_events.py | 3 +- vllm/config/kv_transfer.py | 14 +- vllm/config/load.py | 22 +- vllm/config/lora.py | 8 +- vllm/config/model.py | 134 ++-- vllm/config/multimodal.py | 28 +- vllm/config/observability.py | 8 +- vllm/config/parallel.py | 26 +- vllm/config/pooler.py | 22 +- vllm/config/scheduler.py | 4 +- vllm/config/speculative.py | 30 +- vllm/config/speech_to_text.py | 3 +- vllm/config/vllm.py | 32 +- vllm/connections.py | 29 +- vllm/device_allocator/cumem.py | 13 +- vllm/distributed/communication_op.py | 6 +- .../device_communicators/all2all.py | 4 +- .../device_communicators/all_reduce_utils.py | 8 +- .../base_device_communicator.py | 23 +- .../device_communicators/cpu_communicator.py | 24 +- .../device_communicators/cuda_communicator.py | 25 +- .../device_communicators/cuda_wrapper.py | 6 +- .../device_communicators/custom_all_reduce.py | 13 +- .../device_communicators/pynccl.py | 7 +- .../device_communicators/pynccl_allocator.py | 4 +- .../device_communicators/pynccl_wrapper.py | 4 +- .../device_communicators/quick_all_reduce.py | 5 +- .../device_communicators/ray_communicator.py | 12 +- .../device_communicators/shm_broadcast.py | 30 +- .../shm_object_storage.py | 18 +- .../device_communicators/symm_mem.py | 11 +- .../device_communicators/tpu_communicator.py | 5 +- .../device_communicators/xpu_communicator.py | 7 +- vllm/distributed/eplb/eplb_state.py | 15 +- vllm/distributed/eplb/rebalance_execute.py | 3 +- vllm/distributed/kv_events.py | 29 +- .../kv_transfer/kv_connector/factory.py | 3 +- .../kv_transfer/kv_connector/utils.py | 10 +- .../kv_transfer/kv_connector/v1/base.py | 18 +- .../kv_connector/v1/lmcache_connector.py | 8 +- .../kv_transfer/kv_connector/v1/metrics.py | 6 +- .../kv_connector/v1/multi_connector.py | 22 +- .../kv_connector/v1/nixl_connector.py | 32 +- .../kv_connector/v1/offloading_connector.py | 12 +- 
.../kv_connector/v1/p2p/p2p_nccl_connector.py | 6 +- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 13 +- .../v1/shared_storage_connector.py | 4 +- .../kv_transfer/kv_lookup_buffer/base.py | 9 +- .../kv_lookup_buffer/mooncake_store.py | 7 +- .../kv_lookup_buffer/simple_buffer.py | 11 +- vllm/distributed/kv_transfer/kv_pipe/base.py | 5 +- .../kv_transfer/kv_pipe/mooncake_pipe.py | 13 +- .../kv_transfer/kv_pipe/pynccl_pipe.py | 20 +- .../kv_transfer/kv_transfer_state.py | 6 +- vllm/distributed/parallel_state.py | 87 ++- vllm/distributed/tpu_distributed_utils.py | 6 +- vllm/distributed/utils.py | 6 +- vllm/engine/arg_utils.py | 139 ++-- vllm/engine/metrics.py | 26 +- vllm/engine/protocol.py | 32 +- vllm/entrypoints/api_server.py | 6 +- vllm/entrypoints/chat_utils.py | 211 +++--- vllm/entrypoints/cli/benchmark/main.py | 4 +- vllm/entrypoints/cli/collect_env.py | 4 +- vllm/entrypoints/cli/main.py | 2 - vllm/entrypoints/cli/openai.py | 4 +- vllm/entrypoints/cli/run_batch.py | 4 +- vllm/entrypoints/cli/serve.py | 3 +- vllm/entrypoints/cli/types.py | 4 +- vllm/entrypoints/context.py | 16 +- vllm/entrypoints/harmony_utils.py | 12 +- vllm/entrypoints/launcher.py | 4 +- vllm/entrypoints/llm.py | 234 +++--- vllm/entrypoints/logger.py | 17 +- vllm/entrypoints/openai/api_server.py | 40 +- vllm/entrypoints/openai/cli_args.py | 32 +- vllm/entrypoints/openai/logits_processors.py | 9 +- vllm/entrypoints/openai/protocol.py | 678 +++++++++--------- vllm/entrypoints/openai/run_batch.py | 5 +- vllm/entrypoints/openai/serving_chat.py | 44 +- .../openai/serving_classification.py | 12 +- vllm/entrypoints/openai/serving_completion.py | 32 +- vllm/entrypoints/openai/serving_embedding.py | 36 +- vllm/entrypoints/openai/serving_engine.py | 188 +++-- vllm/entrypoints/openai/serving_models.py | 19 +- vllm/entrypoints/openai/serving_pooling.py | 14 +- vllm/entrypoints/openai/serving_responses.py | 72 +- vllm/entrypoints/openai/serving_score.py | 58 +- .../openai/serving_tokenization.py | 14 +- 
.../openai/serving_transcription.py | 9 +- vllm/entrypoints/openai/speech_to_text.py | 25 +- .../tool_parsers/abstract_tool_parser.py | 13 +- .../tool_parsers/deepseekv31_tool_parser.py | 5 +- .../tool_parsers/deepseekv3_tool_parser.py | 5 +- .../tool_parsers/glm4_moe_tool_parser.py | 6 +- .../granite_20b_fc_tool_parser.py | 3 +- .../tool_parsers/granite_tool_parser.py | 3 +- .../openai/tool_parsers/hermes_tool_parser.py | 5 +- .../tool_parsers/hunyuan_a13b_tool_parser.py | 6 +- .../tool_parsers/internlm2_tool_parser.py | 3 +- .../openai/tool_parsers/jamba_tool_parser.py | 5 +- .../tool_parsers/kimi_k2_tool_parser.py | 5 +- .../llama4_pythonic_tool_parser.py | 8 +- .../openai/tool_parsers/llama_tool_parser.py | 3 +- .../tool_parsers/minimax_tool_parser.py | 14 +- .../tool_parsers/mistral_tool_parser.py | 5 +- .../openai/tool_parsers/openai_tool_parser.py | 6 +- .../tool_parsers/phi4mini_tool_parser.py | 4 +- .../tool_parsers/pythonic_tool_parser.py | 8 +- .../tool_parsers/qwen3coder_tool_parser.py | 12 +- .../tool_parsers/qwen3xml_tool_parser.py | 18 +- .../tool_parsers/seed_oss_tool_parser.py | 8 +- .../openai/tool_parsers/step3_tool_parser.py | 6 +- vllm/entrypoints/renderer.py | 81 +-- vllm/entrypoints/score_utils.py | 34 +- vllm/entrypoints/ssl.py | 8 +- vllm/entrypoints/tool_server.py | 12 +- vllm/entrypoints/utils.py | 12 +- vllm/envs.py | 85 +-- vllm/executor/executor_base.py | 38 +- vllm/executor/ray_distributed_executor.py | 21 +- vllm/executor/ray_utils.py | 6 +- vllm/executor/uniproc_executor.py | 11 +- vllm/forward_context.py | 32 +- vllm/inputs/data.py | 50 +- vllm/inputs/parse.py | 20 +- vllm/inputs/preprocess.py | 86 ++- vllm/logger.py | 4 +- vllm/logging_utils/dump_input.py | 5 +- vllm/logits_process.py | 12 +- vllm/logprobs.py | 7 +- vllm/lora/layers/base.py | 16 +- vllm/lora/layers/base_linear.py | 13 +- vllm/lora/layers/column_parallel_linear.py | 63 +- vllm/lora/layers/logits_processor.py | 15 +- vllm/lora/layers/replicated_linear.py | 5 +- 
vllm/lora/layers/row_parallel_linear.py | 15 +- vllm/lora/layers/vocal_parallel_embedding.py | 13 +- vllm/lora/lora_weights.py | 16 +- vllm/lora/models.py | 49 +- .../ops/triton_ops/lora_kernel_metadata.py | 3 +- vllm/lora/peft_helper.py | 12 +- vllm/lora/punica_wrapper/punica_base.py | 48 +- vllm/lora/punica_wrapper/punica_cpu.py | 12 +- vllm/lora/punica_wrapper/punica_gpu.py | 10 +- vllm/lora/punica_wrapper/punica_tpu.py | 16 +- vllm/lora/punica_wrapper/punica_xpu.py | 10 +- vllm/lora/punica_wrapper/utils.py | 6 +- vllm/lora/request.py | 9 +- vllm/lora/resolver.py | 3 +- vllm/lora/utils.py | 8 +- vllm/lora/worker_manager.py | 6 +- vllm/model_executor/custom_op.py | 3 +- vllm/model_executor/layers/activation.py | 3 +- vllm/model_executor/layers/batch_invariant.py | 10 +- vllm/model_executor/layers/fla/ops/chunk.py | 7 +- .../layers/fla/ops/chunk_delta_h.py | 7 +- vllm/model_executor/layers/fla/ops/chunk_o.py | 7 +- .../layers/fla/ops/chunk_scaled_dot_kkt.py | 5 +- vllm/model_executor/layers/fla/ops/cumsum.py | 13 +- .../layers/fla/ops/fused_recurrent.py | 19 +- vllm/model_executor/layers/fla/ops/l2norm.py | 3 +- .../layers/fla/ops/layernorm_guard.py | 13 +- .../layers/fla/ops/solve_tril.py | 3 +- vllm/model_executor/layers/fla/ops/utils.py | 5 +- vllm/model_executor/layers/fla/ops/wy_fast.py | 3 +- .../layers/fused_moe/__init__.py | 6 +- .../layers/fused_moe/batched_deep_gemm_moe.py | 11 +- .../batched_triton_or_deep_gemm_moe.py | 11 +- .../model_executor/layers/fused_moe/config.py | 128 ++-- .../layers/fused_moe/cpu_fused_moe.py | 42 +- .../layers/fused_moe/cutlass_moe.py | 54 +- .../layers/fused_moe/deep_gemm_moe.py | 19 +- .../layers/fused_moe/deep_gemm_utils.py | 15 +- .../fused_moe/deepep_ht_prepare_finalize.py | 28 +- .../fused_moe/deepep_ll_prepare_finalize.py | 20 +- .../fused_moe/flashinfer_cutlass_moe.py | 21 +- .../flashinfer_cutlass_prepare_finalize.py | 9 +- .../layers/fused_moe/flashinfer_trtllm_moe.py | 13 +- 
.../layers/fused_moe/fused_batched_moe.py | 42 +- .../layers/fused_moe/fused_marlin_moe.py | 74 +- .../layers/fused_moe/fused_moe.py | 169 ++--- .../fused_moe/gpt_oss_triton_kernels_moe.py | 21 +- vllm/model_executor/layers/fused_moe/layer.py | 216 +++--- .../layers/fused_moe/modular_kernel.py | 99 +-- .../layers/fused_moe/moe_align_block_size.py | 3 +- .../layers/fused_moe/moe_permute_unpermute.py | 27 +- .../layers/fused_moe/pplx_prepare_finalize.py | 26 +- .../layers/fused_moe/prepare_finalize.py | 7 +- .../layers/fused_moe/rocm_aiter_fused_moe.py | 51 +- .../layers/fused_moe/routing_simulator.py | 8 +- .../layers/fused_moe/shared_fused_moe.py | 5 +- .../fused_moe/topk_weight_and_reduce.py | 9 +- .../layers/fused_moe/triton_deep_gemm_moe.py | 11 +- .../layers/fused_moe/trtllm_moe.py | 11 +- vllm/model_executor/layers/fused_moe/utils.py | 47 +- vllm/model_executor/layers/layernorm.py | 34 +- vllm/model_executor/layers/lightning_attn.py | 3 +- vllm/model_executor/layers/linear.py | 50 +- .../model_executor/layers/logits_processor.py | 14 +- .../layers/mamba/linear_attn.py | 14 +- .../layers/mamba/mamba_mixer.py | 14 +- .../layers/mamba/mamba_mixer2.py | 20 +- .../layers/mamba/mamba_utils.py | 13 +- .../layers/mamba/ops/causal_conv1d.py | 31 +- .../model_executor/layers/mamba/short_conv.py | 6 +- vllm/model_executor/layers/mla.py | 21 +- vllm/model_executor/layers/pooler.py | 54 +- .../layers/quantization/auto_round.py | 6 +- .../model_executor/layers/quantization/awq.py | 8 +- .../layers/quantization/awq_marlin.py | 29 +- .../layers/quantization/base_config.py | 8 +- .../layers/quantization/bitblas.py | 12 +- .../layers/quantization/bitsandbytes.py | 33 +- .../compressed_tensors/compressed_tensors.py | 26 +- .../compressed_tensors_moe.py | 126 ++-- .../schemes/compressed_tensors_24.py | 11 +- .../schemes/compressed_tensors_scheme.py | 3 +- .../schemes/compressed_tensors_w4a16_24.py | 6 +- .../schemes/compressed_tensors_w4a16_nvfp4.py | 4 +- 
.../schemes/compressed_tensors_w4a4_nvfp4.py | 4 +- .../schemes/compressed_tensors_w4a8_fp8.py | 10 +- .../schemes/compressed_tensors_w4a8_int.py | 6 +- .../schemes/compressed_tensors_w8a16_fp8.py | 4 +- .../schemes/compressed_tensors_w8a8_fp8.py | 4 +- .../schemes/compressed_tensors_w8a8_int8.py | 4 +- .../schemes/compressed_tensors_wNa16.py | 10 +- .../compressed_tensors/transform/linear.py | 15 +- .../compressed_tensors/transform/module.py | 3 +- .../transform/schemes/linear_qutlass_nvfp4.py | 5 +- .../compressed_tensors/triton_scaled_mm.py | 3 +- .../quantization/compressed_tensors/utils.py | 11 +- .../layers/quantization/deepspeedfp.py | 2 +- .../layers/quantization/experts_int8.py | 23 +- .../layers/quantization/fbgemm_fp8.py | 2 +- .../model_executor/layers/quantization/fp8.py | 37 +- .../layers/quantization/fp_quant.py | 10 +- .../layers/quantization/gguf.py | 31 +- .../layers/quantization/gptq.py | 12 +- .../layers/quantization/gptq_bitblas.py | 6 +- .../layers/quantization/gptq_marlin.py | 33 +- .../layers/quantization/gptq_marlin_24.py | 4 +- .../layers/quantization/hqq_marlin.py | 4 +- .../layers/quantization/input_quant_fp8.py | 13 +- .../layers/quantization/ipex_quant.py | 35 +- .../kernels/mixed_precision/MPLinearKernel.py | 18 +- .../kernels/mixed_precision/__init__.py | 4 +- .../kernels/mixed_precision/allspark.py | 5 +- .../kernels/mixed_precision/bitblas.py | 13 +- .../kernels/mixed_precision/conch.py | 6 +- .../kernels/mixed_precision/cutlass.py | 5 +- .../kernels/mixed_precision/dynamic_4bit.py | 5 +- .../kernels/mixed_precision/exllama.py | 5 +- .../kernels/mixed_precision/machete.py | 5 +- .../kernels/mixed_precision/marlin.py | 5 +- .../kernels/scaled_mm/ScaledMMLinearKernel.py | 11 +- .../kernels/scaled_mm/__init__.py | 3 +- .../quantization/kernels/scaled_mm/aiter.py | 9 +- .../quantization/kernels/scaled_mm/cpu.py | 9 +- .../quantization/kernels/scaled_mm/cutlass.py | 5 +- .../quantization/kernels/scaled_mm/triton.py | 5 +- 
.../quantization/kernels/scaled_mm/xla.py | 9 +- .../layers/quantization/modelopt.py | 65 +- .../layers/quantization/moe_wna16.py | 27 +- .../layers/quantization/mxfp4.py | 41 +- .../layers/quantization/petit.py | 10 +- .../layers/quantization/ptpc_fp8.py | 4 +- .../layers/quantization/quark/quark.py | 22 +- .../layers/quantization/quark/quark_moe.py | 43 +- .../quark/schemes/quark_ocp_mx.py | 13 +- .../quark/schemes/quark_scheme.py | 3 +- .../quark/schemes/quark_w8a8_fp8.py | 9 +- .../quark/schemes/quark_w8a8_int8.py | 8 +- .../layers/quantization/quark/utils.py | 4 +- .../model_executor/layers/quantization/rtn.py | 25 +- .../layers/quantization/schema.py | 4 +- .../layers/quantization/torchao.py | 4 +- .../layers/quantization/tpu_int8.py | 2 +- .../quantization/utils/bitblas_utils.py | 13 +- .../quantization/utils/flashinfer_fp4_moe.py | 2 - .../quantization/utils/flashinfer_utils.py | 15 +- .../layers/quantization/utils/fp8_utils.py | 40 +- .../layers/quantization/utils/gptq_utils.py | 14 +- .../layers/quantization/utils/int8_utils.py | 8 +- .../layers/quantization/utils/layer_utils.py | 3 +- .../quantization/utils/machete_utils.py | 3 +- .../layers/quantization/utils/marlin_utils.py | 19 +- .../quantization/utils/marlin_utils_fp4.py | 5 +- .../quantization/utils/marlin_utils_fp8.py | 3 +- .../quantization/utils/marlin_utils_test.py | 4 +- .../layers/quantization/utils/mxfp4_utils.py | 19 +- .../layers/quantization/utils/ocp_mx_utils.py | 5 +- .../layers/quantization/utils/petit_utils.py | 8 +- .../layers/quantization/utils/quant_utils.py | 12 +- .../layers/quantization/utils/w8a8_utils.py | 18 +- vllm/model_executor/layers/resampler.py | 20 +- .../layers/rotary_embedding/__init__.py | 8 +- .../layers/rotary_embedding/base.py | 18 +- .../layers/rotary_embedding/common.py | 4 +- .../rotary_embedding/deepseek_scaling_rope.py | 13 +- .../rotary_embedding/dual_chunk_rope.py | 5 +- .../rotary_embedding/ernie45_vl_rope.py | 9 +- 
.../rotary_embedding/linear_scaling_rope.py | 3 +- .../rotary_embedding/llama4_vision_rope.py | 9 +- .../layers/rotary_embedding/mrope.py | 39 +- .../rotary_embedding/ntk_scaling_rope.py | 3 +- .../phi3_long_rope_scaled_rope.py | 11 +- vllm/model_executor/layers/utils.py | 12 +- .../layers/vocab_parallel_embedding.py | 17 +- vllm/model_executor/model_loader/__init__.py | 4 +- .../model_loader/bitsandbytes_loader.py | 10 +- .../model_loader/default_loader.py | 10 +- .../model_loader/runai_streamer_loader.py | 3 +- .../model_loader/sharded_state_loader.py | 8 +- .../model_executor/model_loader/tensorizer.py | 48 +- .../model_loader/tensorizer_loader.py | 3 +- vllm/model_executor/model_loader/tpu.py | 5 +- vllm/model_executor/model_loader/utils.py | 7 +- .../model_loader/weight_utils.py | 36 +- vllm/model_executor/models/adapters.py | 8 +- vllm/model_executor/models/aimv2.py | 5 +- vllm/model_executor/models/apertus.py | 38 +- vllm/model_executor/models/arcee.py | 28 +- vllm/model_executor/models/arctic.py | 31 +- vllm/model_executor/models/aria.py | 28 +- vllm/model_executor/models/aya_vision.py | 20 +- vllm/model_executor/models/baichuan.py | 27 +- vllm/model_executor/models/bailing_moe.py | 33 +- vllm/model_executor/models/bamba.py | 29 +- vllm/model_executor/models/bert.py | 49 +- vllm/model_executor/models/bert_with_rope.py | 37 +- vllm/model_executor/models/blip.py | 21 +- vllm/model_executor/models/blip2.py | 42 +- vllm/model_executor/models/bloom.py | 25 +- vllm/model_executor/models/chameleon.py | 48 +- vllm/model_executor/models/chatglm.py | 31 +- vllm/model_executor/models/clip.py | 88 +-- vllm/model_executor/models/cohere2_vision.py | 18 +- vllm/model_executor/models/commandr.py | 33 +- vllm/model_executor/models/dbrx.py | 37 +- vllm/model_executor/models/deepseek.py | 32 +- vllm/model_executor/models/deepseek_eagle.py | 5 +- vllm/model_executor/models/deepseek_mtp.py | 13 +- vllm/model_executor/models/deepseek_v2.py | 70 +- 
vllm/model_executor/models/deepseek_vl2.py | 32 +- vllm/model_executor/models/dots1.py | 32 +- vllm/model_executor/models/dots_ocr.py | 44 +- vllm/model_executor/models/ernie45_moe.py | 34 +- vllm/model_executor/models/ernie45_vl.py | 68 +- vllm/model_executor/models/ernie45_vl_moe.py | 38 +- vllm/model_executor/models/ernie_mtp.py | 9 +- vllm/model_executor/models/exaone.py | 38 +- vllm/model_executor/models/exaone4.py | 32 +- vllm/model_executor/models/falcon.py | 26 +- vllm/model_executor/models/falcon_h1.py | 33 +- vllm/model_executor/models/flex_olmo.py | 6 +- vllm/model_executor/models/fuyu.py | 16 +- vllm/model_executor/models/gemma.py | 35 +- vllm/model_executor/models/gemma2.py | 31 +- vllm/model_executor/models/gemma3.py | 31 +- vllm/model_executor/models/gemma3_mm.py | 22 +- vllm/model_executor/models/gemma3n.py | 47 +- vllm/model_executor/models/glm4.py | 21 +- vllm/model_executor/models/glm4_1v.py | 52 +- vllm/model_executor/models/glm4_moe.py | 36 +- vllm/model_executor/models/glm4_moe_mtp.py | 17 +- vllm/model_executor/models/glm4v.py | 44 +- vllm/model_executor/models/gpt2.py | 29 +- vllm/model_executor/models/gpt_bigcode.py | 25 +- vllm/model_executor/models/gpt_j.py | 25 +- vllm/model_executor/models/gpt_neox.py | 25 +- vllm/model_executor/models/gpt_oss.py | 15 +- vllm/model_executor/models/granite.py | 30 +- vllm/model_executor/models/granite_speech.py | 28 +- vllm/model_executor/models/granitemoe.py | 26 +- .../model_executor/models/granitemoehybrid.py | 33 +- .../model_executor/models/granitemoeshared.py | 17 +- vllm/model_executor/models/gritlm.py | 13 +- vllm/model_executor/models/grok1.py | 31 +- vllm/model_executor/models/h2ovl.py | 53 +- vllm/model_executor/models/hunyuan_v1.py | 46 +- .../models/hyperclovax_vision.py | 50 +- .../models/idefics2_vision_model.py | 21 +- vllm/model_executor/models/idefics3.py | 46 +- vllm/model_executor/models/interfaces.py | 132 ++-- vllm/model_executor/models/interfaces_base.py | 26 +- 
vllm/model_executor/models/intern_vit.py | 25 +- vllm/model_executor/models/internlm2.py | 34 +- vllm/model_executor/models/internlm2_ve.py | 17 +- vllm/model_executor/models/interns1.py | 42 +- vllm/model_executor/models/interns1_vit.py | 21 +- vllm/model_executor/models/internvl.py | 122 ++-- vllm/model_executor/models/jais.py | 25 +- vllm/model_executor/models/jamba.py | 39 +- vllm/model_executor/models/jina_vl.py | 9 +- vllm/model_executor/models/keye.py | 176 ++--- vllm/model_executor/models/keye_vl1_5.py | 42 +- vllm/model_executor/models/kimi_vl.py | 20 +- vllm/model_executor/models/lfm2.py | 36 +- vllm/model_executor/models/lfm2_moe.py | 36 +- vllm/model_executor/models/llama.py | 38 +- vllm/model_executor/models/llama4.py | 12 +- vllm/model_executor/models/llama4_eagle.py | 11 +- vllm/model_executor/models/llama_eagle.py | 5 +- vllm/model_executor/models/llama_eagle3.py | 17 +- vllm/model_executor/models/llava.py | 56 +- vllm/model_executor/models/llava_next.py | 32 +- .../model_executor/models/llava_next_video.py | 22 +- vllm/model_executor/models/llava_onevision.py | 46 +- vllm/model_executor/models/longcat_flash.py | 27 +- .../models/longcat_flash_mtp.py | 17 +- vllm/model_executor/models/mamba.py | 19 +- vllm/model_executor/models/mamba2.py | 17 +- vllm/model_executor/models/midashenglm.py | 52 +- vllm/model_executor/models/mimo.py | 9 +- vllm/model_executor/models/mimo_mtp.py | 13 +- vllm/model_executor/models/minicpm.py | 42 +- vllm/model_executor/models/minicpm3.py | 12 +- vllm/model_executor/models/minicpm_eagle.py | 15 +- vllm/model_executor/models/minicpmo.py | 30 +- vllm/model_executor/models/minicpmv.py | 74 +- vllm/model_executor/models/minimax_text_01.py | 36 +- vllm/model_executor/models/minimax_vl_01.py | 38 +- vllm/model_executor/models/mistral3.py | 32 +- vllm/model_executor/models/mixtral.py | 33 +- vllm/model_executor/models/mllama4.py | 34 +- vllm/model_executor/models/modernbert.py | 27 +- vllm/model_executor/models/module_mapping.py 
| 9 +- vllm/model_executor/models/molmo.py | 80 +-- vllm/model_executor/models/moonvit.py | 15 +- vllm/model_executor/models/mpt.py | 25 +- .../model_executor/models/nano_nemotron_vl.py | 90 +-- vllm/model_executor/models/nemotron.py | 36 +- vllm/model_executor/models/nemotron_h.py | 43 +- vllm/model_executor/models/nemotron_nas.py | 34 +- vllm/model_executor/models/nemotron_vl.py | 37 +- vllm/model_executor/models/nvlm_d.py | 7 +- vllm/model_executor/models/olmo.py | 27 +- vllm/model_executor/models/olmo2.py | 15 +- vllm/model_executor/models/olmoe.py | 21 +- vllm/model_executor/models/opt.py | 33 +- vllm/model_executor/models/orion.py | 28 +- vllm/model_executor/models/ovis.py | 22 +- vllm/model_executor/models/ovis2_5.py | 26 +- vllm/model_executor/models/paligemma.py | 26 +- vllm/model_executor/models/persimmon.py | 23 +- vllm/model_executor/models/phi.py | 25 +- vllm/model_executor/models/phi3v.py | 34 +- vllm/model_executor/models/phi4_multimodal.py | 48 +- vllm/model_executor/models/phi4mm.py | 32 +- vllm/model_executor/models/phi4mm_audio.py | 76 +- vllm/model_executor/models/phi4mm_utils.py | 37 +- vllm/model_executor/models/phimoe.py | 33 +- vllm/model_executor/models/pixtral.py | 62 +- vllm/model_executor/models/plamo2.py | 18 +- vllm/model_executor/models/qwen.py | 30 +- vllm/model_executor/models/qwen2.py | 32 +- .../models/qwen2_5_omni_thinker.py | 48 +- vllm/model_executor/models/qwen2_5_vl.py | 64 +- vllm/model_executor/models/qwen2_audio.py | 28 +- vllm/model_executor/models/qwen2_moe.py | 43 +- vllm/model_executor/models/qwen2_rm.py | 7 +- vllm/model_executor/models/qwen2_vl.py | 76 +- vllm/model_executor/models/qwen3.py | 26 +- vllm/model_executor/models/qwen3_moe.py | 39 +- vllm/model_executor/models/qwen3_next.py | 27 +- vllm/model_executor/models/qwen3_next_mtp.py | 11 +- .../models/qwen3_omni_moe_thinker.py | 60 +- vllm/model_executor/models/qwen3_vl.py | 70 +- vllm/model_executor/models/qwen3_vl_moe.py | 11 +- 
vllm/model_executor/models/qwen_vl.py | 56 +- vllm/model_executor/models/radio.py | 38 +- vllm/model_executor/models/registry.py | 46 +- vllm/model_executor/models/roberta.py | 17 +- vllm/model_executor/models/rvl.py | 3 +- vllm/model_executor/models/seed_oss.py | 29 +- vllm/model_executor/models/siglip.py | 35 +- vllm/model_executor/models/siglip2navit.py | 19 +- vllm/model_executor/models/skyworkr1v.py | 76 +- vllm/model_executor/models/smolvlm.py | 5 +- vllm/model_executor/models/solar.py | 30 +- vllm/model_executor/models/stablelm.py | 25 +- vllm/model_executor/models/starcoder2.py | 25 +- vllm/model_executor/models/step3_text.py | 28 +- vllm/model_executor/models/step3_vl.py | 44 +- vllm/model_executor/models/swin.py | 51 +- vllm/model_executor/models/tarsier.py | 47 +- vllm/model_executor/models/terratorch.py | 32 +- vllm/model_executor/models/transformers.py | 46 +- .../models/transformers_pooling.py | 10 +- vllm/model_executor/models/ultravox.py | 36 +- vllm/model_executor/models/utils.py | 32 +- vllm/model_executor/models/vision.py | 26 +- vllm/model_executor/models/voxtral.py | 38 +- vllm/model_executor/models/whisper.py | 56 +- vllm/model_executor/models/zamba2.py | 42 +- vllm/model_executor/parameter.py | 25 +- vllm/model_executor/utils.py | 4 +- vllm/multimodal/audio.py | 4 +- vllm/multimodal/cache.py | 40 +- vllm/multimodal/evs.py | 3 +- vllm/multimodal/hasher.py | 5 +- vllm/multimodal/image.py | 3 +- vllm/multimodal/inputs.py | 32 +- vllm/multimodal/parse.py | 46 +- vllm/multimodal/processing.py | 131 ++-- vllm/multimodal/profiling.py | 30 +- vllm/multimodal/registry.py | 30 +- vllm/multimodal/utils.py | 24 +- vllm/multimodal/video.py | 4 +- vllm/outputs.py | 32 +- vllm/platforms/__init__.py | 12 +- vllm/platforms/cpu.py | 6 +- vllm/platforms/cuda.py | 11 +- vllm/platforms/interface.py | 42 +- vllm/platforms/rocm.py | 10 +- vllm/platforms/tpu.py | 6 +- vllm/platforms/xpu.py | 8 +- vllm/plugins/__init__.py | 3 +- vllm/plugins/io_processors/__init__.py 
| 2 - vllm/plugins/io_processors/interface.py | 14 +- .../lora_resolvers/filesystem_resolver.py | 3 +- vllm/pooling_params.py | 18 +- vllm/profiler/layerwise_profile.py | 19 +- vllm/profiler/utils.py | 4 +- vllm/ray/ray_env.py | 7 +- vllm/reasoning/abs_reasoning_parsers.py | 18 +- vllm/reasoning/basic_parsers.py | 7 +- .../reasoning/deepseek_r1_reasoning_parser.py | 3 +- vllm/reasoning/glm4_moe_reasoning_parser.py | 5 +- vllm/reasoning/gptoss_reasoning_parser.py | 5 +- vllm/reasoning/granite_reasoning_parser.py | 7 +- .../hunyuan_a13b_reasoning_parser.py | 5 +- vllm/reasoning/olmo3_reasoning_parser.py | 16 +- vllm/reasoning/qwen3_reasoning_parser.py | 5 +- vllm/reasoning/step3_reasoning_parser.py | 5 +- vllm/sampling_params.py | 102 +-- vllm/scalar_type.py | 13 +- vllm/sequence.py | 20 +- vllm/tracing.py | 7 +- .../chat_templates/registry.py | 11 +- vllm/transformers_utils/config.py | 93 +-- vllm/transformers_utils/config_parser_base.py | 7 +- vllm/transformers_utils/configs/dotsocr.py | 4 +- vllm/transformers_utils/configs/eagle.py | 11 +- vllm/transformers_utils/configs/kimi_vl.py | 5 +- vllm/transformers_utils/configs/lfm2_moe.py | 3 +- vllm/transformers_utils/configs/medusa.py | 5 +- .../transformers_utils/configs/midashenglm.py | 13 +- .../configs/mlp_speculator.py | 3 +- vllm/transformers_utils/configs/ovis.py | 8 +- vllm/transformers_utils/configs/radio.py | 8 +- .../configs/speculators/base.py | 4 +- vllm/transformers_utils/configs/step3_vl.py | 8 +- vllm/transformers_utils/configs/ultravox.py | 10 +- vllm/transformers_utils/detokenizer_utils.py | 5 +- vllm/transformers_utils/dynamic_module.py | 17 +- vllm/transformers_utils/processor.py | 22 +- vllm/transformers_utils/processors/ovis.py | 8 +- vllm/transformers_utils/processors/ovis2_5.py | 18 +- vllm/transformers_utils/runai_utils.py | 5 +- vllm/transformers_utils/s3_utils.py | 6 +- vllm/transformers_utils/tokenizer.py | 18 +- vllm/transformers_utils/tokenizer_base.py | 22 +- 
vllm/transformers_utils/tokenizers/mistral.py | 30 +- vllm/transformers_utils/utils.py | 14 +- vllm/usage/usage_lib.py | 42 +- vllm/utils/__init__.py | 64 +- vllm/utils/cache.py | 24 +- vllm/utils/deep_gemm.py | 5 +- vllm/utils/flashinfer.py | 5 +- vllm/utils/gc_utils.py | 4 +- vllm/utils/jsontree.py | 57 +- vllm/utils/tensor_schema.py | 22 +- vllm/v1/attention/backends/cpu_attn.py | 58 +- vllm/v1/attention/backends/flash_attn.py | 49 +- vllm/v1/attention/backends/flashinfer.py | 18 +- vllm/v1/attention/backends/flex_attention.py | 43 +- vllm/v1/attention/backends/gdn_attn.py | 29 +- vllm/v1/attention/backends/mamba1_attn.py | 3 +- vllm/v1/attention/backends/mamba2_attn.py | 15 +- vllm/v1/attention/backends/mla/common.py | 69 +- vllm/v1/attention/backends/mla/cutlass_mla.py | 16 +- .../attention/backends/mla/flashattn_mla.py | 18 +- .../attention/backends/mla/flashinfer_mla.py | 18 +- vllm/v1/attention/backends/mla/flashmla.py | 18 +- .../attention/backends/mla/flashmla_sparse.py | 20 +- vllm/v1/attention/backends/mla/indexer.py | 8 +- .../attention/backends/mla/rocm_aiter_mla.py | 24 +- vllm/v1/attention/backends/mla/triton_mla.py | 13 +- vllm/v1/attention/backends/pallas.py | 17 +- vllm/v1/attention/backends/rocm_aiter_fa.py | 29 +- .../backends/rocm_aiter_unified_attn.py | 18 +- vllm/v1/attention/backends/rocm_attn.py | 28 +- vllm/v1/attention/backends/short_conv_attn.py | 9 +- vllm/v1/attention/backends/tree_attn.py | 24 +- vllm/v1/attention/backends/triton_attn.py | 30 +- vllm/v1/attention/backends/utils.py | 22 +- vllm/v1/attention/backends/xformers.py | 18 +- vllm/v1/core/block_pool.py | 14 +- vllm/v1/core/kv_cache_coordinator.py | 5 +- vllm/v1/core/kv_cache_manager.py | 14 +- vllm/v1/core/kv_cache_utils.py | 20 +- vllm/v1/core/sched/async_scheduler.py | 2 - vllm/v1/core/sched/interface.py | 4 +- vllm/v1/core/sched/output.py | 17 +- vllm/v1/core/sched/request_queue.py | 4 +- vllm/v1/core/sched/scheduler.py | 6 +- vllm/v1/core/sched/utils.py | 3 +- 
vllm/v1/cudagraph_dispatcher.py | 3 +- vllm/v1/engine/__init__.py | 54 +- vllm/v1/engine/async_llm.py | 70 +- vllm/v1/engine/coordinator.py | 5 +- vllm/v1/engine/core.py | 48 +- vllm/v1/engine/core_client.py | 102 ++- vllm/v1/engine/detokenizer.py | 11 +- vllm/v1/engine/llm_engine.py | 42 +- vllm/v1/engine/logprobs.py | 19 +- vllm/v1/engine/output_processor.py | 90 ++- vllm/v1/engine/parallel_sampling.py | 2 +- vllm/v1/engine/processor.py | 34 +- vllm/v1/engine/utils.py | 28 +- vllm/v1/executor/abstract.py | 13 +- vllm/v1/executor/multiproc_executor.py | 31 +- vllm/v1/executor/ray_distributed_executor.py | 5 +- vllm/v1/kv_cache_interface.py | 13 +- vllm/v1/kv_offload/abstract.py | 3 +- vllm/v1/kv_offload/cpu.py | 5 +- vllm/v1/kv_offload/factory.py | 3 +- vllm/v1/kv_offload/lru_manager.py | 5 +- vllm/v1/metrics/loggers.py | 55 +- vllm/v1/metrics/prometheus.py | 3 +- vllm/v1/metrics/ray_wrappers.py | 23 +- vllm/v1/metrics/reader.py | 3 +- vllm/v1/metrics/stats.py | 22 +- vllm/v1/outputs.py | 24 +- vllm/v1/pool/metadata.py | 5 +- vllm/v1/request.py | 44 +- vllm/v1/sample/logits_processor/__init__.py | 16 +- vllm/v1/sample/logits_processor/builtin.py | 18 +- vllm/v1/sample/logits_processor/interface.py | 2 +- vllm/v1/sample/logits_processor/state.py | 18 +- vllm/v1/sample/metadata.py | 15 +- vllm/v1/sample/ops/topk_topp_sampler.py | 27 +- vllm/v1/sample/rejection_sampler.py | 9 +- vllm/v1/sample/sampler.py | 6 +- vllm/v1/sample/tpu/metadata.py | 3 +- vllm/v1/sample/tpu/sampler.py | 6 +- vllm/v1/serial_utils.py | 20 +- vllm/v1/spec_decode/eagle.py | 11 +- vllm/v1/spec_decode/metrics.py | 3 +- vllm/v1/structured_output/__init__.py | 7 +- vllm/v1/structured_output/backend_guidance.py | 10 +- .../backend_lm_format_enforcer.py | 4 +- vllm/v1/structured_output/backend_outlines.py | 2 - vllm/v1/structured_output/backend_types.py | 9 +- vllm/v1/structured_output/backend_xgrammar.py | 2 - vllm/v1/structured_output/request.py | 14 +- vllm/v1/structured_output/utils.py | 7 +- 
vllm/v1/utils.py | 22 +- vllm/v1/worker/block_table.py | 3 +- vllm/v1/worker/cpu_model_runner.py | 4 +- vllm/v1/worker/cpu_worker.py | 4 +- vllm/v1/worker/dp_utils.py | 11 +- vllm/v1/worker/gpu_input_batch.py | 44 +- vllm/v1/worker/gpu_model_runner.py | 83 ++- vllm/v1/worker/gpu_ubatch_wrapper.py | 9 +- vllm/v1/worker/gpu_worker.py | 18 +- .../worker/kv_connector_model_runner_mixin.py | 7 +- vllm/v1/worker/lora_model_runner_mixin.py | 11 +- vllm/v1/worker/tpu_input_batch.py | 18 +- vllm/v1/worker/tpu_model_runner.py | 14 +- vllm/v1/worker/tpu_worker.py | 7 +- vllm/v1/worker/ubatch_utils.py | 2 +- vllm/v1/worker/utils.py | 10 +- vllm/v1/worker/worker_base.py | 10 +- 944 files changed, 9490 insertions(+), 10121 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index ba7c733be0b25..4021fede72153 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -8,7 +8,6 @@ import sys import time import traceback from dataclasses import dataclass, field -from typing import Optional, Union import aiohttp import huggingface_hub.constants @@ -28,13 +27,13 @@ class RequestFuncInput: prompt_len: int output_len: int model: str - model_name: Optional[str] = None - logprobs: Optional[int] = None - extra_body: Optional[dict] = None - multi_modal_content: Optional[dict | list[dict]] = None + model_name: str | None = None + logprobs: int | None = None + extra_body: dict | None = None + multi_modal_content: dict | list[dict] | None = None ignore_eos: bool = False - language: Optional[str] = None - request_id: Optional[str] = None + language: str | None = None + request_id: str | None = None @dataclass @@ -52,7 +51,7 @@ class RequestFuncOutput: async def async_request_tgi( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith("generate_stream") @@ -133,7 +132,7 @@ async def 
async_request_tgi( async def async_request_trt_llm( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith("generate_stream") @@ -204,7 +203,7 @@ async def async_request_trt_llm( async def async_request_deepspeed_mii( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith(("completions", "profile")), ( @@ -267,7 +266,7 @@ async def async_request_deepspeed_mii( async def async_request_openai_completions( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith(("completions", "profile")), ( @@ -367,7 +366,7 @@ async def async_request_openai_completions( async def async_request_openai_chat_completions( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith(("chat/completions", "profile")), ( @@ -476,7 +475,7 @@ async def async_request_openai_chat_completions( async def async_request_openai_audio( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: # Lazy import without PlaceholderModule to avoid vllm dep. 
import soundfile @@ -610,7 +609,7 @@ def get_tokenizer( tokenizer_mode: str = "auto", trust_remote_code: bool = False, **kwargs, -) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: +) -> PreTrainedTokenizer | PreTrainedTokenizerFast: if pretrained_model_name_or_path is not None and not os.path.exists( pretrained_model_name_or_path ): diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index b5e2613de1cd4..d7dc0e991c4d1 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -32,7 +32,6 @@ import dataclasses import json import random import time -from typing import Optional from transformers import PreTrainedTokenizerBase @@ -80,7 +79,7 @@ def sample_requests_from_dataset( num_requests: int, tokenizer: PreTrainedTokenizerBase, input_length_range: tuple[int, int], - fixed_output_len: Optional[int], + fixed_output_len: int | None, ) -> list[Request]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") @@ -128,7 +127,7 @@ def sample_requests_from_random( num_requests: int, tokenizer: PreTrainedTokenizerBase, input_length_range: tuple[int, int], - fixed_output_len: Optional[int], + fixed_output_len: int | None, prefix_len: int, ) -> list[Request]: requests = [] diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index bb453791c1862..769f52dbab6ea 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -7,7 +7,6 @@ import dataclasses import json import random import time -from typing import Optional from transformers import AutoTokenizer, PreTrainedTokenizerBase @@ -24,7 +23,7 @@ def sample_requests( dataset_path: str, num_requests: int, tokenizer: PreTrainedTokenizerBase, - fixed_output_len: Optional[int], + fixed_output_len: int | None, ) -> list[tuple[str, int, int, int]]: if fixed_output_len is not None and fixed_output_len < 4: raise 
ValueError("output_len too small") diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index 58b9767d09390..059668f1789cc 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -32,7 +32,6 @@ import uuid import warnings from collections.abc import AsyncGenerator from dataclasses import dataclass -from typing import Optional import datasets import numpy as np @@ -316,7 +315,7 @@ def calculate_metrics( tokenizer: PreTrainedTokenizerBase, selected_percentile_metrics: list[str], selected_percentiles: list[float], - goodput_config_dict: Optional[dict[str, float]] = None, + goodput_config_dict: dict[str, float] | None = None, ) -> tuple[BenchmarkMetrics, list[int]]: actual_output_lens: list[int] = [] total_input = 0 @@ -436,9 +435,9 @@ async def benchmark( selected_percentile_metrics: list[str], selected_percentiles: list[str], ignore_eos: bool, - max_concurrency: Optional[int], + max_concurrency: int | None, structured_output_ratio: float, - goodput_config_dict: Optional[dict[str, float]] = None, + goodput_config_dict: dict[str, float] | None = None, ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index 98624abdf49fb..f0d661f9d5349 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -6,7 +6,7 @@ import math import os import time from types import TracebackType -from typing import Any, Optional, Union +from typing import Any def convert_to_pytorch_benchmark_format( @@ -92,7 +92,7 @@ class TimeCollector: def __init__(self, scale: int) -> None: self.cnt: int = 0 self._sum: int = 0 - self._max: Optional[int] = None + self._max: int | None = None self.scale = scale self.start_time: int = time.monotonic_ns() @@ -104,13 +104,13 @@ class TimeCollector: else: self._max = max(self._max, v) - def avg(self) -> 
Union[float, str]: + def avg(self) -> float | str: return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A" - def max(self) -> Union[float, str]: + def max(self) -> float | str: return self._max / self.scale if self._max else "N/A" - def dump_avg_max(self) -> list[Union[float, str]]: + def dump_avg_max(self) -> list[float | str]: return [self.avg(), self.max()] def __enter__(self) -> None: @@ -118,8 +118,8 @@ class TimeCollector: def __exit__( self, - exc_type: Optional[type[BaseException]], - exc_value: Optional[BaseException], - exc_traceback: Optional[TracebackType], + exc_type: type[BaseException] | None, + exc_value: BaseException | None, + exc_traceback: TracebackType | None, ) -> None: self.collect(time.monotonic_ns() - self.start_time) diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py index 9ec270bbd2e98..22fc2678fd1c9 100644 --- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py @@ -6,8 +6,7 @@ import copy import itertools import pickle as pkl import time -from collections.abc import Iterable -from typing import Callable +from collections.abc import Callable, Iterable import torch import torch.utils.benchmark as TBenchmark diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 02f8c593392c4..2deebf3ddb7ae 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -6,8 +6,7 @@ import copy import itertools import pickle as pkl import time -from collections.abc import Iterable -from typing import Callable, Optional +from collections.abc import Callable, Iterable import torch import torch.utils.benchmark as TBenchmark @@ -53,7 +52,7 @@ def bench_int8( n: int, label: str, sub_label: str, - bench_kernels: Optional[list[str]] = None, + bench_kernels: list[str] | None = None, ) -> 
Iterable[TMeasurement]: """Benchmark INT8-based kernels.""" assert dtype == torch.int8 @@ -108,7 +107,7 @@ def bench_fp8( n: int, label: str, sub_label: str, - bench_kernels: Optional[list[str]] = None, + bench_kernels: list[str] | None = None, ) -> Iterable[TMeasurement]: """Benchmark FP8-based kernels.""" assert dtype == torch.float8_e4m3fn @@ -183,7 +182,7 @@ def bench( n: int, label: str, sub_label: str, - bench_kernels: Optional[list[str]] = None, + bench_kernels: list[str] | None = None, ) -> Iterable[TMeasurement]: if dtype == torch.int8: return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels) @@ -201,7 +200,7 @@ def print_timers(timers: Iterable[TMeasurement]): def run( dtype: torch.dtype, MKNs: Iterable[tuple[int, int, int]], - bench_kernels: Optional[list[str]] = None, + bench_kernels: list[str] | None = None, ) -> Iterable[TMeasurement]: results = [] for m, k, n in MKNs: diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py index 901524214469e..d809bf1db8cbc 100644 --- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py +++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py @@ -3,10 +3,9 @@ import pickle as pkl import time -from collections.abc import Iterable +from collections.abc import Callable, Iterable from dataclasses import dataclass from itertools import product -from typing import Callable, Optional import torch import torch.utils.benchmark as TBenchmark @@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]: def unfused_int8_impl( rms_norm_layer: RMSNorm, x: torch.Tensor, - residual: Optional[torch.Tensor], + residual: torch.Tensor | None, quant_dtype: torch.dtype, ): # Norm @@ -68,7 +67,7 @@ def unfused_int8_impl( def unfused_fp8_impl( rms_norm_layer: RMSNorm, x: torch.Tensor, - residual: Optional[torch.Tensor], + residual: torch.Tensor | None, quant_dtype: torch.dtype, ): # Norm @@ -85,7 +84,7 @@ def unfused_fp8_impl( def fused_impl( rms_norm_layer: 
RMSNorm, # this stores the weights x: torch.Tensor, - residual: Optional[torch.Tensor], + residual: torch.Tensor | None, quant_dtype: torch.dtype, ): out, _ = ops.rms_norm_dynamic_per_token_quant( diff --git a/benchmarks/kernels/bench_per_token_quant_fp8.py b/benchmarks/kernels/bench_per_token_quant_fp8.py index e08e5680c191e..9a52ea7f47e3a 100644 --- a/benchmarks/kernels/bench_per_token_quant_fp8.py +++ b/benchmarks/kernels/bench_per_token_quant_fp8.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools -from typing import Callable +from collections.abc import Callable from unittest.mock import patch import pandas as pd diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py index 4cbdde5a5b2ca..df06a940e6d41 100644 --- a/benchmarks/kernels/benchmark_device_communicators.py +++ b/benchmarks/kernels/benchmark_device_communicators.py @@ -22,8 +22,8 @@ Example: import json import os import time +from collections.abc import Callable from contextlib import nullcontext -from typing import Callable, Optional import torch import torch.distributed as dist @@ -264,12 +264,12 @@ class CommunicatorBenchmark: def benchmark_allreduce_single( self, sequence_length: int, - allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]], + allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None], should_use_fn: Callable[[torch.Tensor], bool], context, num_warmup: int, num_trials: int, - ) -> Optional[float]: + ) -> float | None: """Benchmark method with CUDA graph optimization.""" try: # Create test tensor (2D: sequence_length x hidden_size) diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index 799b16999873f..39338f3387613 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -6,11 +6,12 @@ import copy import json import pickle import time +from 
collections.abc import Callable from dataclasses import dataclass from enum import Enum, auto from itertools import product from pathlib import Path -from typing import Any, Callable, Optional +from typing import Any import torch import torch.utils.benchmark as TBenchmark @@ -158,7 +159,7 @@ def ref_group_gemm( seq_lens_cpu: torch.Tensor, prompt_lora_mapping_cpu: torch.Tensor, scaling: float, - add_inputs: Optional[bool], + add_inputs: bool | None, ): """ Torch group gemm reference implementation to test correctness of @@ -316,8 +317,8 @@ class BenchmarkContext: lora_rank: int sort_by_lora_id: bool dtype: torch.dtype - seq_length: Optional[int] = None - num_slices: Optional[int] = None # num_slices for slice based ops + seq_length: int | None = None + num_slices: int | None = None # num_slices for slice based ops def with_seq_length(self, seq_length: int) -> "BenchmarkContext": ctx = copy.copy(self) @@ -561,7 +562,7 @@ class BenchmarkTensors: } def bench_fn_kwargs( - self, op_type: OpType, add_inputs: Optional[bool] = None + self, op_type: OpType, add_inputs: bool | None = None ) -> dict[str, Any]: if op_type.is_shrink_fn(): assert add_inputs is None @@ -575,7 +576,7 @@ class BenchmarkTensors: raise ValueError(f"Unrecognized optype {self}") def test_correctness( - self, op_type: OpType, expand_fn_add_inputs: Optional[bool] + self, op_type: OpType, expand_fn_add_inputs: bool | None ) -> bool: """ Test correctness of op_type implementation against a grouped gemm @@ -611,8 +612,8 @@ def bench_optype( ctx: BenchmarkContext, arg_pool_size: int, op_type: OpType, - cuda_graph_nops: Optional[int] = None, - expand_fn_add_inputs: Optional[bool] = None, + cuda_graph_nops: int | None = None, + expand_fn_add_inputs: bool | None = None, test_correctness: bool = False, ) -> TMeasurement: assert arg_pool_size >= 1 @@ -679,7 +680,7 @@ def bench_torch_mm( ctx: BenchmarkContext, arg_pool_size: int, op_type: OpType, - cuda_graph_nops: Optional[int] = None, + cuda_graph_nops: int | 
None = None, ) -> TMeasurement: """ Benchmark basic torch.mm as a roofline. @@ -744,7 +745,7 @@ def use_cuda_graph_recommendation() -> str: """ -def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None): +def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None): compare = TBenchmark.Compare(timers) compare.print() diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index 1b1c3b321cce4..e1d5239f5cc97 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -8,10 +8,9 @@ import math import os import pickle as pkl import time -from collections.abc import Iterable +from collections.abc import Callable, Iterable from dataclasses import dataclass from itertools import product -from typing import Callable, Optional import pandas as pd import torch @@ -63,23 +62,23 @@ class BenchmarkTensors: a: torch.Tensor w_q: torch.Tensor - group_size: Optional[int] + group_size: int | None wtype: ScalarType w_g_s: torch.Tensor - w_g_zp: Optional[torch.Tensor] - w_ch_s: Optional[torch.Tensor] - w_tok_s: Optional[torch.Tensor] + w_g_zp: torch.Tensor | None + w_ch_s: torch.Tensor | None + w_tok_s: torch.Tensor | None @dataclass class TypeConfig: act_type: torch.dtype weight_type: ScalarType - output_type: Optional[torch.dtype] - group_scale_type: Optional[torch.dtype] - group_zero_type: Optional[torch.dtype] - channel_scale_type: Optional[torch.dtype] - token_scale_type: Optional[torch.dtype] + output_type: torch.dtype | None + group_scale_type: torch.dtype | None + group_zero_type: torch.dtype | None + channel_scale_type: torch.dtype | None + token_scale_type: torch.dtype | None def rand_data(shape, dtype=torch.float16, scale=1): @@ -93,8 +92,8 @@ def quantize_and_pack( atype: torch.dtype, w: torch.Tensor, wtype: ScalarType, - stype: Optional[torch.dtype], - group_size: Optional[int], + stype: torch.dtype | None, + group_size: int | None, 
zero_points: bool = False, ): assert wtype.is_integer(), "TODO: support floating point weights" @@ -113,7 +112,7 @@ def quantize_and_pack( def create_bench_tensors( - shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int] + shape: tuple[int, int, int], types: TypeConfig, group_size: int | None ) -> list[BenchmarkTensors]: m, n, k = shape @@ -331,8 +330,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable]) return res -_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None -_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None +_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None +_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None def bench( diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 7e0376c18ecc7..8f9907952d24d 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -3,7 +3,6 @@ import random import time -from typing import Optional import torch @@ -37,7 +36,7 @@ def main( seed: int, do_profile: bool, device: str = "cuda", - kv_cache_dtype: Optional[str] = None, + kv_cache_dtype: str | None = None, ) -> None: current_platform.seed_everything(seed) diff --git a/benchmarks/kernels/benchmark_per_token_group_quant.py b/benchmarks/kernels/benchmark_per_token_group_quant.py index 1ccb5e08b3d57..bdc1eb733084e 100644 --- a/benchmarks/kernels/benchmark_per_token_group_quant.py +++ b/benchmarks/kernels/benchmark_per_token_group_quant.py @@ -3,8 +3,8 @@ import argparse import math +from collections.abc import Callable from contextlib import contextmanager -from typing import Callable from unittest.mock import patch import torch diff --git a/benchmarks/kernels/benchmark_reshape_and_cache.py b/benchmarks/kernels/benchmark_reshape_and_cache.py index af9841daadf24..d4b564d2ec6c9 100644 --- a/benchmarks/kernels/benchmark_reshape_and_cache.py +++ b/benchmarks/kernels/benchmark_reshape_and_cache.py @@ 
-1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from __future__ import annotations - import random import time diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py index 0aace571064a0..93df14f0d95cc 100644 --- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py +++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from __future__ import annotations - import random import time diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py index 4cf633a81358d..d8d7f5bcf9dad 100644 --- a/benchmarks/kernels/benchmark_rmsnorm.py +++ b/benchmarks/kernels/benchmark_rmsnorm.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools -from typing import Optional, Union import torch from flashinfer.norm import fused_add_rmsnorm, rmsnorm @@ -21,8 +20,8 @@ class HuggingFaceRMSNorm(nn.Module): def forward( self, x: torch.Tensor, - residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: + residual: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: orig_dtype = x.dtype x = x.to(torch.float32) if residual is not None: @@ -41,7 +40,7 @@ class HuggingFaceRMSNorm(nn.Module): def rmsnorm_naive( x: torch.Tensor, weight: torch.Tensor, - residual: Optional[torch.Tensor] = None, + residual: torch.Tensor | None = None, eps: float = 1e-6, ): naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps) @@ -65,7 +64,7 @@ def rmsnorm_naive( def rmsnorm_flashinfer( x: torch.Tensor, weight: torch.Tensor, - residual: Optional[torch.Tensor] = None, + residual: torch.Tensor | None = None, eps: float = 1e-6, ): orig_shape = x.shape @@ -89,7 +88,7 @@ def 
rmsnorm_flashinfer( def rmsnorm_vllm( x: torch.Tensor, weight: torch.Tensor, - residual: Optional[torch.Tensor] = None, + residual: torch.Tensor | None = None, eps: float = 1e-6, ): orig_shape = x.shape diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index b81baf17a8c67..24869c91a8d70 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from itertools import accumulate -from typing import Optional import nvtx import torch @@ -18,7 +17,7 @@ def benchmark_rope_kernels_multi_lora( seq_len: int, num_heads: int, head_size: int, - rotary_dim: Optional[int], + rotary_dim: int | None, dtype: torch.dtype, seed: int, device: str, diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py index 6ddab46214577..f7cdc25794cae 100644 --- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py @@ -4,7 +4,6 @@ import csv import os from datetime import datetime -from typing import Optional import flashinfer import torch @@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn): @torch.no_grad() def benchmark_decode( dtype: torch.dtype, - quant_dtypes: tuple[ - Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype] - ], + quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None], batch_size: int, max_seq_len: int, num_heads: tuple[int, int] = (64, 8), diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py index 131df74c7de1b..7993354475fcc 100644 --- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py +++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py @@ -4,7 +4,6 @@ import csv import os from datetime import datetime -from typing import Optional import 
flashinfer import torch @@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn): @torch.no_grad() def benchmark_prefill( dtype: torch.dtype, - quant_dtypes: tuple[ - Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype] - ], + quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None], batch_size: int, max_seq_len: int, num_heads: tuple[int, int] = (64, 8), diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py index 4bbb36bb43592..a9af811bbe9ca 100644 --- a/benchmarks/kernels/utils.py +++ b/benchmarks/kernels/utils.py @@ -2,8 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses -from collections.abc import Iterable -from typing import Any, Callable, Optional +from collections.abc import Callable, Iterable +from typing import Any import torch import torch.utils.benchmark as TBenchmark @@ -55,7 +55,7 @@ class Bench: def __init__( self, - cuda_graph_params: Optional[CudaGraphBenchParams], + cuda_graph_params: CudaGraphBenchParams | None, label: str, sub_label: str, description: str, diff --git a/benchmarks/multi_turn/bench_dataset.py b/benchmarks/multi_turn/bench_dataset.py index 67b937930d58c..2674899d1cc56 100644 --- a/benchmarks/multi_turn/bench_dataset.py +++ b/benchmarks/multi_turn/bench_dataset.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from statistics import mean -from typing import Any, NamedTuple, Optional, Union +from typing import Any, NamedTuple import numpy as np # type: ignore import pandas as pd # type: ignore @@ -35,8 +35,8 @@ class Distribution(ABC): class UniformDistribution(Distribution): def __init__( self, - min_val: Union[int, float], - max_val: Union[int, float], + min_val: int | float, + max_val: int | float, is_integer: bool = True, ) -> None: self.min_val = min_val @@ -56,7 +56,7 @@ class UniformDistribution(Distribution): class ConstantDistribution(Distribution): 
- def __init__(self, value: Union[int, float]) -> None: + def __init__(self, value: int | float) -> None: self.value = value self.max_val = value @@ -68,7 +68,7 @@ class ConstantDistribution(Distribution): class ZipfDistribution(Distribution): - def __init__(self, alpha: float, max_val: Optional[int] = None) -> None: + def __init__(self, alpha: float, max_val: int | None = None) -> None: self.alpha = alpha self.max_val = max_val @@ -83,7 +83,7 @@ class ZipfDistribution(Distribution): class PoissonDistribution(Distribution): - def __init__(self, alpha: float, max_val: Optional[int] = None) -> None: + def __init__(self, alpha: float, max_val: int | None = None) -> None: self.alpha = alpha self.max_val = max_val @@ -100,11 +100,11 @@ class PoissonDistribution(Distribution): class LognormalDistribution(Distribution): def __init__( self, - mean: Optional[float] = None, - sigma: Optional[float] = None, - average: Optional[int] = None, - median_ratio: Optional[float] = None, - max_val: Optional[int] = None, + mean: float | None = None, + sigma: float | None = None, + average: int | None = None, + median_ratio: float | None = None, + max_val: int | None = None, ) -> None: self.average = average self.median_ratio = median_ratio diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py index 233ed460fc8d5..2b0a6da60c256 100644 --- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py +++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py @@ -13,7 +13,7 @@ from datetime import datetime from enum import Enum from http import HTTPStatus from statistics import mean -from typing import NamedTuple, Union +from typing import NamedTuple import aiohttp # type: ignore import numpy as np # type: ignore @@ -169,7 +169,7 @@ class MovingAverage: class DebugStats: def __init__(self, logger: logging.Logger, window_size: int) -> None: self.logger = logger - self.metrics: dict[str, Union[MovingAverage, MetricStats]] = { + 
self.metrics: dict[str, MovingAverage | MetricStats] = { "moving_avg_ttft_ms": MovingAverage(window_size), "moving_avg_tpot_ms": MovingAverage(window_size), "ttft_ms": MetricStats(), @@ -636,7 +636,7 @@ async def client_main( if args.verbose: curr_time_sec: float = time.perf_counter() - time_since_last_turn: Union[str, float] = "N/A" + time_since_last_turn: str | float = "N/A" if conv_id in time_of_last_turn: time_since_last_turn = round( curr_time_sec - time_of_last_turn[conv_id], 3 @@ -928,13 +928,13 @@ async def main_mp( f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}" # noqa: E501 ) - rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3) + rps: str | float = round(len(client_metrics) / runtime_sec, 3) if len(client_metrics) < (5 * bench_args.num_clients): # Do not estimate the RPS if the number of samples is very low # (threshold can be tuned if needed) rps = "N/A" - runtime_left_sec: Union[str, float] = round( + runtime_left_sec: str | float = round( (runtime_sec / finished_convs) * (total_convs - finished_convs), 3 ) if percent < 0.05: diff --git a/benchmarks/multi_turn/convert_sharegpt_to_openai.py b/benchmarks/multi_turn/convert_sharegpt_to_openai.py index c3622c99a2e53..fccab4d0ce21a 100644 --- a/benchmarks/multi_turn/convert_sharegpt_to_openai.py +++ b/benchmarks/multi_turn/convert_sharegpt_to_openai.py @@ -13,7 +13,7 @@ import argparse import json import random from statistics import mean -from typing import Any, Optional +from typing import Any import pandas as pd # type: ignore import tqdm # type: ignore @@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool: def content_is_valid( - content: str, min_content_len: Optional[int], max_content_len: Optional[int] + content: str, min_content_len: int | None, max_content_len: int | None ) -> bool: if min_content_len and len(content) < min_content_len: return False @@ -37,7 
+37,7 @@ def content_is_valid( def print_stats( - conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None + conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None ) -> None: # Collect statistics stats = [] @@ -109,12 +109,12 @@ def convert_sharegpt_to_openai( seed: int, input_file: str, output_file: str, - max_items: Optional[int], - min_content_len: Optional[int] = None, - max_content_len: Optional[int] = None, - min_turns: Optional[int] = None, - max_turns: Optional[int] = None, - model: Optional[str] = None, + max_items: int | None, + min_content_len: int | None = None, + max_content_len: int | None = None, + min_turns: int | None = None, + max_turns: int | None = None, + model: str | None = None, ) -> None: if min_turns and max_turns: assert min_turns <= max_turns diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py index 5e742d0b02932..34fb64c413db2 100644 --- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py +++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum -from typing import Union from cutlass_library import * @@ -22,7 +21,7 @@ class MixedInputKernelScheduleType(enum.Enum): TmaWarpSpecializedCooperative = enum_auto() -VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = { **DataTypeNames, # type: ignore **{ VLLMDataType.u4b8: "u4b8", @@ -30,7 +29,7 @@ VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = { }, } -VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = { **DataTypeTag, # type: ignore **{ VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t", @@ -38,7 +37,7 @@ VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = { }, } -VLLMDataTypeSize: dict[Union[VLLMDataType, 
DataType], int] = { +VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = { **DataTypeSize, # type: ignore **{ VLLMDataType.u4b8: 4, @@ -46,7 +45,7 @@ VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = { }, } -VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = { VLLMDataType.u4b8: "vllm::kU4B8", VLLMDataType.u8b128: "vllm::kU8B128", DataType.u4: "vllm::kU4", @@ -57,7 +56,7 @@ VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = { DataType.bf16: "vllm::kBfloat16", } -VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = { +VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = { DataType.u8: "at::ScalarType::Byte", DataType.s8: "at::ScalarType::Char", DataType.e4m3: "at::ScalarType::Float8_e4m3fn", @@ -67,9 +66,7 @@ VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = { DataType.f32: "at::ScalarType::Float", } -VLLMKernelScheduleTag: dict[ - Union[MixedInputKernelScheduleType, KernelScheduleType], str -] = { +VLLMKernelScheduleTag: dict[MixedInputKernelScheduleType | KernelScheduleType, str] = { **KernelScheduleTag, # type: ignore **{ MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized", # noqa: E501 diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index d29a199c5d32f..8bd17ba69cec1 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -9,7 +9,6 @@ from collections.abc import Iterable from copy import deepcopy from dataclasses import dataclass, fields from functools import reduce -from typing import Optional, Union import jinja2 from vllm_cutlass_library_extension import ( @@ -259,7 +258,7 @@ class ScheduleConfig: @dataclass(frozen=True) class TypeConfig: a: DataType - b: Union[DataType, VLLMDataType] + b: DataType | VLLMDataType b_group_scale: DataType 
b_group_zeropoint: DataType b_channel_scale: DataType @@ -280,7 +279,7 @@ class PrepackTypeConfig: class ImplConfig: types: TypeConfig schedules: list[ScheduleConfig] - heuristic: list[tuple[Optional[str], ScheduleConfig]] + heuristic: list[tuple[str | None, ScheduleConfig]] def generate_sch_sig(schedule_config: ScheduleConfig) -> str: diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md index 62e58e5c6ac58..4ce748ce1fed4 100644 --- a/docs/contributing/model/transcription.md +++ b/docs/contributing/model/transcription.md @@ -16,7 +16,7 @@ Declare supported languages and capabilities: ??? code "supported_languages and supports_transcription_only" ```python - from typing import ClassVar, Mapping, Optional, Literal + from typing import ClassVar, Mapping, Literal import numpy as np import torch from torch import nn @@ -81,10 +81,10 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt audio: np.ndarray, stt_config: SpeechToTextConfig, model_config: ModelConfig, - language: Optional[str], + language: str | None, task_type: Literal["transcribe", "translate"], request_prompt: str, - to_language: Optional[str], + to_language: str | None, ) -> PromptType: # Example with a free-form instruction prompt task_word = "Transcribe" if task_type == "transcribe" else "Translate" @@ -117,10 +117,10 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries: audio: np.ndarray, stt_config: SpeechToTextConfig, model_config: ModelConfig, - language: Optional[str], + language: str | None, task_type: Literal["transcribe", "translate"], request_prompt: str, - to_language: Optional[str], + to_language: str | None, ) -> PromptType: if language is None: raise ValueError("Language must be specified") @@ -150,7 +150,7 @@ If your model requires a language and you want a default, override this method ( ??? 
code "validate_language()" ```python @classmethod - def validate_language(cls, language: Optional[str]) -> Optional[str]: + def validate_language(cls, language: str | None) -> str | None: if language is None: logger.warning( "Defaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field.") @@ -175,7 +175,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics: audio_duration_s: float, stt_config: SpeechToTextConfig, model_config: ModelConfig, - ) -> Optional[int]: + ) -> int | None: # Return None if unknown; otherwise return an estimate. return int(audio_duration_s * stt_config.sample_rate // 320) # example ``` diff --git a/docs/design/logits_processors.md b/docs/design/logits_processors.md index 20d78ca3aae2c..da61d2a85e466 100644 --- a/docs/design/logits_processors.md +++ b/docs/design/logits_processors.md @@ -174,7 +174,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus from collections.abc import Sequence from dataclasses import dataclass from enum import Enum, auto - from typing import TYPE_CHECKING, Optional + from typing import TYPE_CHECKING import torch @@ -244,7 +244,7 @@ The previous sections alluded to the interfaces which vLLM logits processors mus @abstractmethod def update_state( self, - batch_update: Optional["BatchUpdate"], + batch_update: "BatchUpdate" | None, ) -> None: """Called when there are new output tokens, prior to each forward pass. 
@@ -274,7 +274,7 @@ A vLLM logits processor must subclass `LogitsProcessor` and define (at minimum) * Return `True` if the logits processor is argmax invariant (never changes what is the highest-logit-value token ID for a given request), `False` if the logits processor may modify argmax * `is_argmax_invariant()` is evaluated once at startup; if `True`, vLLM will skip applying this logits processor in a given step when all requests use greedy sampling -* `update_state(self, batch_update: Optional["BatchUpdate"]) -> None`: +* `update_state(self, batch_update: "BatchUpdate" | None) -> None`: * Consume a `BatchUpdate` data structure representing persistent batch state changes at the beginning of the current engine step * Use the `BatchUpdate` members to update logits processor internal state * **Note:** batch update data structure may be `None`, signaling no change to the batch constituents. In this case, the LogitsProcessor might still want to update its state based on the updated `output_token_ids` lists that it could have retained when they were added. diff --git a/docs/features/custom_logitsprocs.md b/docs/features/custom_logitsprocs.md index 201b340c5972c..b8ad53863cd7a 100644 --- a/docs/features/custom_logitsprocs.md +++ b/docs/features/custom_logitsprocs.md @@ -93,7 +93,6 @@ The contrived example below implements a custom logits processor which consumes ??? 
code "Example custom logits processor definition" ``` python - from typing import Optional import torch from vllm.config import VllmConfig from vllm.sampling_params import SamplingParams @@ -112,7 +111,7 @@ The contrived example below implements a custom logits processor which consumes """Never impacts greedy sampling""" return False - def update_state(self, batch_update: Optional[BatchUpdate]): + def update_state(self, batch_update: BatchUpdate | None): if not batch_update: return diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index a36664e470450..c4eed2037781a 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -10,7 +10,7 @@ on HuggingFace model repository. import os from dataclasses import asdict -from typing import Any, NamedTuple, Optional +from typing import Any, NamedTuple from huggingface_hub import snapshot_download from transformers import AutoTokenizer @@ -30,11 +30,11 @@ question_per_audio_count = { class ModelRequestData(NamedTuple): engine_args: EngineArgs - prompt: Optional[str] = None - prompt_token_ids: Optional[dict[str, list[int]]] = None - multi_modal_data: Optional[dict[str, Any]] = None - stop_token_ids: Optional[list[int]] = None - lora_requests: Optional[list[LoRARequest]] = None + prompt: str | None = None + prompt_token_ids: dict[str, list[int]] | None = None + multi_modal_data: dict[str, Any] | None = None + stop_token_ids: list[int] | None = None + lora_requests: list[LoRARequest] | None = None # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on diff --git a/examples/offline_inference/kv_load_failure_recovery/rogue_shared_storage_connector.py b/examples/offline_inference/kv_load_failure_recovery/rogue_shared_storage_connector.py index 0abe7d1612610..5b2acea4c9457 100644 --- a/examples/offline_inference/kv_load_failure_recovery/rogue_shared_storage_connector.py +++ 
b/examples/offline_inference/kv_load_failure_recovery/rogue_shared_storage_connector.py @@ -3,7 +3,7 @@ # ruff: noqa: E501 import logging from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( @@ -81,7 +81,7 @@ class RogueSharedStorageConnector(SharedStorageConnector): def get_finished( self, finished_req_ids: set[str] - ) -> tuple[Optional[set[str]], Optional[set[str]]]: + ) -> tuple[set[str] | None, set[str] | None]: if self._async_load: meta = self._get_connector_metadata() assert isinstance(meta, RogueSharedStorageConnectorMetadata) diff --git a/examples/offline_inference/logits_processor/custom.py b/examples/offline_inference/logits_processor/custom.py index 4112a498f37ab..72e7ce24d7cc8 100644 --- a/examples/offline_inference/logits_processor/custom.py +++ b/examples/offline_inference/logits_processor/custom.py @@ -33,8 +33,6 @@ Output: ' in the hands of the people.\n\nThe future of AI is in the' ------------------------------------------------------------ """ -from typing import Optional - import torch from vllm import LLM, SamplingParams @@ -58,7 +56,7 @@ class DummyLogitsProcessor(LogitsProcessor): def is_argmax_invariant(self) -> bool: return False - def update_state(self, batch_update: Optional[BatchUpdate]): + def update_state(self, batch_update: BatchUpdate | None): process_dict_updates( self.req_info, batch_update, diff --git a/examples/offline_inference/logits_processor/custom_req.py b/examples/offline_inference/logits_processor/custom_req.py index 4c19bb4ce2bae..87cd7473fa9f1 100644 --- a/examples/offline_inference/logits_processor/custom_req.py +++ b/examples/offline_inference/logits_processor/custom_req.py @@ -39,7 +39,7 @@ Output: ' in the hands of the people.\n\nThe future of AI is in the' ------------------------------------------------------------ """ -from typing import Any, Optional 
+from typing import Any import torch @@ -82,7 +82,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor): def new_req_logits_processor( self, params: SamplingParams, - ) -> Optional[RequestLogitsProcessor]: + ) -> RequestLogitsProcessor | None: """This method returns a new request-level logits processor, customized to the `target_token` value associated with a particular request. @@ -96,7 +96,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor): Returns: `Callable` request logits processor, or None """ - target_token: Optional[Any] = params.extra_args and params.extra_args.get( + target_token: Any | None = params.extra_args and params.extra_args.get( "target_token" ) if target_token is None: diff --git a/examples/offline_inference/logits_processor/custom_req_init.py b/examples/offline_inference/logits_processor/custom_req_init.py index 62947d122e01c..3bb82a786040b 100644 --- a/examples/offline_inference/logits_processor/custom_req_init.py +++ b/examples/offline_inference/logits_processor/custom_req_init.py @@ -41,8 +41,6 @@ which indicates that the logits processor is running. However, on a non-"cuda" device, the first and third requests would not repeat the same token. """ -from typing import Optional - import torch from vllm import LLM, SamplingParams @@ -91,7 +89,7 @@ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor): def new_req_logits_processor( self, params: SamplingParams, - ) -> Optional[RequestLogitsProcessor]: + ) -> RequestLogitsProcessor | None: """This method returns a new request-level logits processor, customized to the `target_token` value associated with a particular request. 
diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py index 00d4cb9eb4c41..dc5c6202fa57b 100644 --- a/examples/offline_inference/lora_with_quantization_inference.py +++ b/examples/offline_inference/lora_with_quantization_inference.py @@ -8,7 +8,6 @@ Requires HuggingFace credentials for access. """ import gc -from typing import Optional import torch from huggingface_hub import snapshot_download @@ -19,7 +18,7 @@ from vllm.lora.request import LoRARequest def create_test_prompts( lora_path: str, -) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]: +) -> list[tuple[str, SamplingParams, LoRARequest | None]]: return [ # this is an example of using quantization without LoRA ( @@ -56,7 +55,7 @@ def create_test_prompts( def process_requests( engine: LLMEngine, - test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]], + test_prompts: list[tuple[str, SamplingParams, LoRARequest | None]], ): """Continuously process a list of prompts and handle the outputs.""" request_id = 0 @@ -78,7 +77,7 @@ def process_requests( def initialize_engine( - model: str, quantization: str, lora_repo: Optional[str] + model: str, quantization: str, lora_repo: str | None ) -> LLMEngine: """Initialize the LLMEngine.""" diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py index 6040683c68bcd..6c23cf342e06b 100644 --- a/examples/offline_inference/multilora_inference.py +++ b/examples/offline_inference/multilora_inference.py @@ -7,8 +7,6 @@ for offline inference. Requires HuggingFace credentials for access to Llama2. 
""" -from typing import Optional - from huggingface_hub import snapshot_download from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams @@ -17,7 +15,7 @@ from vllm.lora.request import LoRARequest def create_test_prompts( lora_path: str, -) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]: +) -> list[tuple[str, SamplingParams, LoRARequest | None]]: """Create a list of test prompts with their sampling parameters. 2 requests for base model, 4 requests for the LoRA. We define 2 @@ -68,7 +66,7 @@ def create_test_prompts( def process_requests( engine: LLMEngine, - test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]], + test_prompts: list[tuple[str, SamplingParams, LoRARequest | None]], ): """Continuously process a list of prompts and handle the outputs.""" request_id = 0 diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py index 1a5879a6d35f5..2c73ed6aa6083 100644 --- a/examples/offline_inference/prithvi_geospatial_mae.py +++ b/examples/offline_inference/prithvi_geospatial_mae.py @@ -3,7 +3,6 @@ import argparse import datetime import os -from typing import Union import albumentations import numpy as np @@ -160,7 +159,7 @@ def load_example( file_paths: list[str], mean: list[float] = None, std: list[float] = None, - indices: Union[list[int], None] = None, + indices: list[int] | None = None, ): """Build an input example by loading images in *file_paths*. 
diff --git a/examples/offline_inference/rlhf_utils.py b/examples/offline_inference/rlhf_utils.py index c0e60b9793407..13def88439ef2 100644 --- a/examples/offline_inference/rlhf_utils.py +++ b/examples/offline_inference/rlhf_utils.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import gc -from typing import Callable, Optional, TypedDict +from collections.abc import Callable +from typing import TypedDict import torch import zmq @@ -71,7 +72,7 @@ class WorkerExtension: def rebuild_ipc( - handle: tuple[Callable, tuple], device_id: Optional[int] = None + handle: tuple[Callable, tuple], device_id: int | None = None ) -> torch.Tensor: func, args = handle list_args = list(args) @@ -109,7 +110,7 @@ class ColocateWorkerExtension: self._zmq_ctx = zmq.Context() socket = self._zmq_ctx.socket(zmq.REP) socket.connect(zmq_handles[self.report_device_id()]) - buffer: Optional[torch.Tensor] = None + buffer: torch.Tensor | None = None while True: payload: tuple[Callable, tuple] | list[FlattenedTensorMetadata] | None = ( socket.recv_pyobj() diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 9fd9da3b0855e..1f09dabaf74c8 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -12,7 +12,7 @@ import os import random from contextlib import contextmanager from dataclasses import asdict -from typing import NamedTuple, Optional +from typing import NamedTuple from huggingface_hub import snapshot_download from transformers import AutoTokenizer @@ -28,8 +28,8 @@ from vllm.utils import FlexibleArgumentParser class ModelRequestData(NamedTuple): engine_args: EngineArgs prompts: list[str] - stop_token_ids: Optional[list[int]] = None - lora_requests: Optional[list[LoRARequest]] = None + stop_token_ids: list[int] | None = None + lora_requests: list[LoRARequest] | None = None # NOTE: The default 
`max_num_seqs` and `max_model_len` may result in OOM on diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index c37d40a23ac20..accb6c742a2b6 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -9,7 +9,7 @@ using the chat template defined by the model. import os from argparse import Namespace from dataclasses import asdict -from typing import NamedTuple, Optional +from typing import NamedTuple from huggingface_hub import snapshot_download from PIL.Image import Image @@ -41,9 +41,9 @@ class ModelRequestData(NamedTuple): engine_args: EngineArgs prompt: str image_data: list[Image] - stop_token_ids: Optional[list[int]] = None - chat_template: Optional[str] = None - lora_requests: Optional[list[LoRARequest]] = None + stop_token_ids: list[int] | None = None + chat_template: str | None = None + lora_requests: list[LoRARequest] | None = None # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on @@ -1251,7 +1251,7 @@ model_example_map = { } -def run_generate(model, question: str, image_urls: list[str], seed: Optional[int]): +def run_generate(model, question: str, image_urls: list[str], seed: int | None): req_data = model_example_map[model](question, image_urls) engine_args = asdict(req_data.engine_args) | {"seed": args.seed} @@ -1277,7 +1277,7 @@ def run_generate(model, question: str, image_urls: list[str], seed: Optional[int print("-" * 50) -def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[int]): +def run_chat(model: str, question: str, image_urls: list[str], seed: int | None): req_data = model_example_map[model](question, image_urls) # Disable other modalities to save memory diff --git a/examples/offline_inference/vision_language_pooling.py b/examples/offline_inference/vision_language_pooling.py index 33ffb59014d8f..1ce2cdc436d6a 100644 --- 
a/examples/offline_inference/vision_language_pooling.py +++ b/examples/offline_inference/vision_language_pooling.py @@ -11,7 +11,7 @@ on HuggingFace model repository. from argparse import Namespace from dataclasses import asdict from pathlib import Path -from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args +from typing import Literal, NamedTuple, TypeAlias, TypedDict, get_args from PIL.Image import Image @@ -47,15 +47,15 @@ class TextImagesQuery(TypedDict): QueryModality = Literal["text", "image", "text+image", "text+images"] -Query = Union[TextQuery, ImageQuery, TextImageQuery, TextImagesQuery] +Query: TypeAlias = TextQuery | ImageQuery | TextImageQuery | TextImagesQuery class ModelRequestData(NamedTuple): engine_args: EngineArgs - prompt: Optional[str] = None - image: Optional[Image] = None - query: Optional[str] = None - documents: Optional[ScoreMultiModalParam] = None + prompt: str | None = None + image: Image | None = None + query: str | None = None + documents: ScoreMultiModalParam | None = None def run_clip(query: Query) -> ModelRequestData: @@ -281,7 +281,7 @@ def get_query(modality: QueryModality): raise ValueError(msg) -def run_encode(model: str, modality: QueryModality, seed: Optional[int]): +def run_encode(model: str, modality: QueryModality, seed: int | None): query = get_query(modality) req_data = model_example_map[model](query) @@ -311,7 +311,7 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]): print("-" * 50) -def run_score(model: str, modality: QueryModality, seed: Optional[int]): +def run_score(model: str, modality: QueryModality, seed: int | None): query = get_query(modality) req_data = model_example_map[model](query) diff --git a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py index 1df11d9d84957..2b8482ec717af 100644 --- a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py +++ 
b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py @@ -23,7 +23,7 @@ import logging import os import sys from abc import ABC, abstractmethod -from typing import Callable, Optional +from collections.abc import Callable import aiohttp import requests @@ -49,12 +49,9 @@ class Proxy: decode_instances: list[str], model: str, scheduling_policy: SchedulingPolicy, - custom_create_completion: Optional[ - Callable[[Request], StreamingResponse] - ] = None, - custom_create_chat_completion: Optional[ - Callable[[Request], StreamingResponse] - ] = None, + custom_create_completion: Callable[[Request], StreamingResponse] | None = None, + custom_create_chat_completion: Callable[[Request], StreamingResponse] + | None = None, ): self.prefill_instances = prefill_instances self.decode_instances = decode_instances @@ -348,9 +345,9 @@ class ProxyServer: def __init__( self, args: argparse.Namespace, - scheduling_policy: Optional[SchedulingPolicy] = None, - create_completion: Optional[Callable[[Request], StreamingResponse]] = None, - create_chat_completion: Optional[Callable[[Request], StreamingResponse]] = None, + scheduling_policy: SchedulingPolicy | None = None, + create_completion: Callable[[Request], StreamingResponse] | None = None, + create_chat_completion: Callable[[Request], StreamingResponse] | None = None, ): self.validate_parsed_serve_args(args) self.port = args.port diff --git a/examples/online_serving/kv_events_subscriber.py b/examples/online_serving/kv_events_subscriber.py index f4b79b5e13020..19f6bd5726102 100644 --- a/examples/online_serving/kv_events_subscriber.py +++ b/examples/online_serving/kv_events_subscriber.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional, Union +from typing import Any import msgspec import zmq @@ -25,16 +25,16 @@ class KVCacheEvent( class BlockStored(KVCacheEvent): block_hashes: list[ExternalBlockHash] - 
parent_block_hash: Optional[ExternalBlockHash] + parent_block_hash: ExternalBlockHash | None token_ids: list[int] block_size: int - lora_id: Optional[int] - medium: Optional[str] + lora_id: int | None + medium: str | None class BlockRemoved(KVCacheEvent): block_hashes: list[ExternalBlockHash] - medium: Optional[str] + medium: str | None class AllBlocksCleared(KVCacheEvent): @@ -42,7 +42,7 @@ class AllBlocksCleared(KVCacheEvent): class KVEventBatch(EventBatch): - events: list[Union[BlockStored, BlockRemoved, AllBlocksCleared]] + events: list[BlockStored | BlockRemoved | AllBlocksCleared] def process_event(event_batch): diff --git a/examples/online_serving/multi_instance_data_parallel.py b/examples/online_serving/multi_instance_data_parallel.py index cb230913a422f..b46cea5619671 100644 --- a/examples/online_serving/multi_instance_data_parallel.py +++ b/examples/online_serving/multi_instance_data_parallel.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio -from typing import Optional from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -43,7 +42,7 @@ async def main(): ) prompt = "Who won the 2004 World Series?" - final_output: Optional[RequestOutput] = None + final_output: RequestOutput | None = None async for output in engine_client.generate( prompt=prompt, sampling_params=sampling_params, diff --git a/examples/online_serving/pooling/cohere_rerank_client.py b/examples/online_serving/pooling/cohere_rerank_client.py index 63c9ff9e93980..b32209967be9a 100644 --- a/examples/online_serving/pooling/cohere_rerank_client.py +++ b/examples/online_serving/pooling/cohere_rerank_client.py @@ -8,8 +8,6 @@ Note that `pip install cohere` is needed to run this example. 
run: vllm serve BAAI/bge-reranker-base """ -from typing import Union - import cohere from cohere import Client, ClientV2 @@ -25,7 +23,7 @@ documents = [ def cohere_rerank( - client: Union[Client, ClientV2], model: str, query: str, documents: list[str] + client: Client | ClientV2, model: str, query: str, documents: list[str] ) -> dict: return client.rerank(model=model, query=query, documents=documents) diff --git a/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py index 16ac4378c6863..25ab865a4ee43 100644 --- a/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py +++ b/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py @@ -9,7 +9,7 @@ Refer to each `run_*` function for the command to run the server for that model. import argparse import base64 import io -from typing import Literal, Union +from typing import Literal from openai import OpenAI from openai._types import NOT_GIVEN, NotGiven @@ -29,7 +29,7 @@ def create_chat_embeddings( *, messages: list[ChatCompletionMessageParam], model: str, - encoding_format: Union[Literal["base64", "float"], NotGiven] = NOT_GIVEN, + encoding_format: Literal["base64", "float"] | NotGiven = NOT_GIVEN, ) -> CreateEmbeddingResponse: """ Convenience function for accessing vLLM's Chat Embeddings API, diff --git a/examples/online_serving/structured_outputs/structured_outputs.py b/examples/online_serving/structured_outputs/structured_outputs.py index 3ea6c73e90e8f..02853a95469a6 100644 --- a/examples/online_serving/structured_outputs/structured_outputs.py +++ b/examples/online_serving/structured_outputs/structured_outputs.py @@ -1,21 +1,15 @@ # ruff: noqa: E501 # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from __future__ import annotations - import argparse import asyncio import enum import os -from typing import 
TYPE_CHECKING, Any, Literal +from typing import Any, Literal import openai import pydantic - -if TYPE_CHECKING: - from openai.types.chat import ChatCompletionChunk - +from openai.types.chat import ChatCompletionChunk ConstraintsFormat = Literal[ "choice", diff --git a/pyproject.toml b/pyproject.toml index 49a7a0b8b1210..eb9bdb593baac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,12 +84,6 @@ ignore = [ "B007", # f-string format "UP032", - # Can remove once 3.10+ is the minimum Python version - "UP007", - "UP027", - "UP035", - "UP038", - "UP045", ] [tool.ruff.format] diff --git a/tests/benchmarks/test_random_dataset.py b/tests/benchmarks/test_random_dataset.py index 90527dbeae28c..68e4afdcbe521 100644 --- a/tests/benchmarks/test_random_dataset.py +++ b/tests/benchmarks/test_random_dataset.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random -from typing import Any, NamedTuple, Optional, cast +from typing import Any, NamedTuple, cast import numpy as np import pytest @@ -185,8 +185,8 @@ def _collect_mm_samples( output_len: int = 5, base_items_per_request: int = 2, num_mm_items_range_ratio: float = 0.0, - limit_mm_per_prompt: Optional[dict[str, int]] = None, - bucket_config: Optional[dict[tuple[int, int, int], float]] = None, + limit_mm_per_prompt: dict[str, int] | None = None, + bucket_config: dict[tuple[int, int, int], float] | None = None, enable_multimodal_chat: bool = False, ) -> list[SampleRequest]: if limit_mm_per_prompt is None: diff --git a/tests/ci_envs.py b/tests/ci_envs.py index d16ecce1ef8dd..596a05b9e5f33 100644 --- a/tests/ci_envs.py +++ b/tests/ci_envs.py @@ -5,13 +5,14 @@ These envs only work for a small part of the tests, fix what you need! 
""" import os -from typing import TYPE_CHECKING, Any, Callable, Optional +from collections.abc import Callable +from typing import TYPE_CHECKING, Any if TYPE_CHECKING: VLLM_CI_NO_SKIP: bool = False - VLLM_CI_DTYPE: Optional[str] = None - VLLM_CI_HEAD_DTYPE: Optional[str] = None - VLLM_CI_HF_DTYPE: Optional[str] = None + VLLM_CI_DTYPE: str | None = None + VLLM_CI_HEAD_DTYPE: str | None = None + VLLM_CI_HF_DTYPE: str | None = None environment_variables: dict[str, Callable[[], Any]] = { # A model family has many models with the same architecture. diff --git a/tests/compile/backend.py b/tests/compile/backend.py index 36bc832a1329e..ef1fdd4f9daef 100644 --- a/tests/compile/backend.py +++ b/tests/compile/backend.py @@ -2,9 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import weakref -from collections.abc import Sequence +from collections.abc import Callable, Sequence from copy import deepcopy -from typing import Callable, Union from torch import fx from torch._ops import OpOverload @@ -44,7 +43,7 @@ class TestBackend: Inductor config is default-initialized from VllmConfig.CompilationConfig. """ - def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph], None]]): + def __init__(self, *passes: InductorPass | Callable[[fx.Graph], None]): self.custom_passes = list(passes) compile_config = get_current_vllm_config().compilation_config self.inductor_config = compile_config.inductor_compile_config diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 08f59283a6db5..45317b456af48 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -10,7 +10,7 @@ initialized randomly with a fixed seed. 
""" from dataclasses import dataclass -from typing import Any, Optional +from typing import Any import pytest import torch @@ -162,7 +162,7 @@ class LlamaDecoderLayer(nn.Module): self, positions: torch.Tensor, hidden_states: torch.Tensor, - residual: Optional[torch.Tensor], + residual: torch.Tensor | None, ) -> tuple[torch.Tensor, torch.Tensor]: """ For tractable computation: @@ -217,7 +217,7 @@ class LlamaModel(nn.Module): def forward( self, - input_ids: Optional[torch.Tensor], + input_ids: torch.Tensor | None, positions: torch.Tensor, ) -> torch.Tensor: hidden_states = self.embedding_tokens(input_ids) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 4bcefb30b2e6e..9bfd72260436b 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from __future__ import annotations - import dataclasses import pytest diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 8ccae4cfb9df2..2f3794c90b204 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -1,11 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from __future__ import annotations - import logging import tempfile -from typing import Any, Union +from typing import Any import pytest import torch @@ -217,7 +215,7 @@ def test_inductor_graph_partition_attn_fusion(caplog_vllm): def run_model( - compile_config: Union[int, CompilationConfig], + compile_config: int | CompilationConfig, model: str, model_kwargs: dict[str, Any], ): diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index 0f2e3bffbd311..d1ab85cfb875c 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: 
Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy -from typing import Optional import pytest import torch._dynamo @@ -41,8 +40,8 @@ FP8_DTYPE = current_platform.fp8_dtype() FP4_DTYPE = torch.uint8 # globals needed for string-import custom Dynamo backend field -backend: Optional[TestBackend] = None -backend_unfused: Optional[TestBackend] = None +backend: TestBackend | None = None +backend_unfused: TestBackend | None = None class AttentionQuantPatternModel(torch.nn.Module): diff --git a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py index 34db5a999cbd8..b2fff822bbbb5 100644 --- a/tests/compile/test_wrapper.py +++ b/tests/compile/test_wrapper.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import torch @@ -10,7 +9,7 @@ from vllm.config import CompilationLevel class MyMod(torch.nn.Module): - def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None): + def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None): if cache is not None: return x + cache return x * 2 @@ -24,11 +23,11 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher): compiled_callable, compilation_level=CompilationLevel.DYNAMO_ONCE ) - def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None): + def forward(self, x: torch.Tensor, cache: torch.Tensor | None = None): # this is the function to be compiled return self.model(x, cache) - def __call__(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None): + def __call__(self, x: torch.Tensor, cache: torch.Tensor | None = None): # let torch.compile compile twice if len(self.compiled_codes) == 2: dispatch_id = 0 if cache is None else 1 diff --git a/tests/conftest.py b/tests/conftest.py index 4713e12385965..2fde7f97836d6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -21,7 +21,7 @@ import threading from collections.abc import Generator 
from contextlib import nullcontext from enum import Enum -from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast +from typing import Any, Callable, TypedDict, TypeVar, cast import numpy as np import pytest @@ -68,7 +68,7 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt") _M = TypeVar("_M") -_PromptMultiModalInput = Union[list[_M], list[list[_M]]] +_PromptMultiModalInput = list[_M] | list[list[_M]] PromptImageInput = _PromptMultiModalInput[Image.Image] PromptAudioInput = _PromptMultiModalInput[tuple[np.ndarray, int]] @@ -267,7 +267,7 @@ class HfRunner: return "cpu" if current_platform.is_cpu() else current_platform.device_type - def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: + def wrap_device(self, x: _T, device: str | None = None) -> _T: if x is None or isinstance(x, (bool,)): return x @@ -287,14 +287,14 @@ class HfRunner: model_name: str, dtype: str = "auto", *, - model_kwargs: Optional[dict[str, Any]] = None, + model_kwargs: dict[str, Any] | None = None, trust_remote_code: bool = True, is_sentence_transformer: bool = False, is_cross_encoder: bool = False, skip_tokenizer_init: bool = False, auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM, # Set this to avoid hanging issue - default_torch_num_threads: Optional[int] = None, + default_torch_num_threads: int | None = None, ) -> None: init_ctx = ( nullcontext() @@ -319,7 +319,7 @@ class HfRunner: model_name: str, dtype: str = "auto", *, - model_kwargs: Optional[dict[str, Any]] = None, + model_kwargs: dict[str, Any] | None = None, trust_remote_code: bool = True, is_sentence_transformer: bool = False, is_cross_encoder: bool = False, @@ -406,11 +406,11 @@ class HfRunner: def get_inputs( self, - prompts: Union[list[str], list[list[int]]], - images: Optional[PromptImageInput] = None, - videos: Optional[PromptVideoInput] = None, - audios: Optional[PromptAudioInput] = None, - ) -> list[Union[BatchFeature, BatchEncoding, dict[str, 
torch.Tensor]]]: + prompts: list[str] | list[list[int]], + images: PromptImageInput | None = None, + videos: PromptVideoInput | None = None, + audios: PromptAudioInput | None = None, + ) -> list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]]: if images is not None: assert len(prompts) == len(images) @@ -420,9 +420,7 @@ class HfRunner: if audios is not None: assert len(prompts) == len(audios) - all_inputs: list[ - Union[BatchFeature, BatchEncoding, dict[str, torch.Tensor]] - ] = [] + all_inputs: list[BatchFeature | BatchEncoding | dict[str, torch.Tensor]] = [] for i, prompt in enumerate(prompts): if isinstance(prompt, str): processor_kwargs: dict[str, Any] = { @@ -494,10 +492,10 @@ class HfRunner: def generate( self, - prompts: Union[list[str], list[list[int]]], - images: Optional[PromptImageInput] = None, - videos: Optional[PromptVideoInput] = None, - audios: Optional[PromptAudioInput] = None, + prompts: list[str] | list[list[int]], + images: PromptImageInput | None = None, + videos: PromptVideoInput | None = None, + audios: PromptAudioInput | None = None, **kwargs: Any, ) -> list[tuple[list[list[int]], list[str]]]: all_inputs = self.get_inputs( @@ -522,11 +520,11 @@ class HfRunner: def generate_greedy( self, - prompts: Union[list[str], list[list[int]]], + prompts: list[str] | list[list[int]], max_tokens: int, - images: Optional[PromptImageInput] = None, - videos: Optional[PromptVideoInput] = None, - audios: Optional[PromptAudioInput] = None, + images: PromptImageInput | None = None, + videos: PromptVideoInput | None = None, + audios: PromptAudioInput | None = None, **kwargs: Any, ) -> list[tuple[list[int], str]]: outputs = self.generate( @@ -546,9 +544,9 @@ class HfRunner: prompts: list[str], beam_width: int, max_tokens: int, - images: Optional[PromptImageInput] = None, - videos: Optional[PromptVideoInput] = None, - audios: Optional[PromptAudioInput] = None, + images: PromptImageInput | None = None, + videos: PromptVideoInput | None = None, + audios: 
PromptAudioInput | None = None, ) -> list[tuple[list[list[int]], list[str]]]: outputs = self.generate( prompts, @@ -574,9 +572,9 @@ class HfRunner: self, prompts: list[str], max_tokens: int, - images: Optional[PromptImageInput] = None, - videos: Optional[PromptVideoInput] = None, - audios: Optional[PromptAudioInput] = None, + images: PromptImageInput | None = None, + videos: PromptVideoInput | None = None, + audios: PromptAudioInput | None = None, **kwargs: Any, ) -> list[list[torch.Tensor]]: all_inputs = self.get_inputs( @@ -624,7 +622,7 @@ class HfRunner: def _hidden_states_to_logprobs( self, hidden_states: tuple[tuple[torch.Tensor, ...], ...], - num_logprobs: Optional[int], + num_logprobs: int | None, ) -> tuple[list[dict[int, float]], int]: seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states) output_len = len(hidden_states) @@ -652,10 +650,10 @@ class HfRunner: self, prompts: list[str], max_tokens: int, - num_logprobs: Optional[int], - images: Optional[PromptImageInput] = None, - audios: Optional[PromptAudioInput] = None, - videos: Optional[PromptVideoInput] = None, + num_logprobs: int | None, + images: PromptImageInput | None = None, + audios: PromptAudioInput | None = None, + videos: PromptVideoInput | None = None, **kwargs: Any, ) -> list[TokensTextLogprobs]: all_inputs = self.get_inputs( @@ -734,20 +732,20 @@ class VllmRunner: model_name: str, runner: RunnerOption = "auto", convert: ConvertOption = "auto", - tokenizer_name: Optional[str] = None, + tokenizer_name: str | None = None, tokenizer_mode: str = "auto", trust_remote_code: bool = True, - seed: Optional[int] = 0, - max_model_len: Optional[int] = 1024, + seed: int | None = 0, + max_model_len: int | None = 1024, dtype: str = "auto", disable_log_stats: bool = True, tensor_parallel_size: int = 1, block_size: int = 16 if not torch.xpu.is_available() else 64, - enable_chunked_prefill: Optional[bool] = False, + enable_chunked_prefill: bool | None = False, swap_space: int = 4, - enforce_eager: 
Optional[bool] = False, + enforce_eager: bool | None = False, # Set this to avoid hanging issue - default_torch_num_threads: Optional[int] = None, + default_torch_num_threads: int | None = None, **kwargs, ) -> None: init_ctx = ( @@ -785,10 +783,10 @@ class VllmRunner: def get_inputs( self, - prompts: Union[list[str], list[torch.Tensor], list[list[int]]], - images: Optional[PromptImageInput] = None, - videos: Optional[PromptVideoInput] = None, - audios: Optional[PromptAudioInput] = None, + prompts: list[str] | list[torch.Tensor] | list[list[int]], + images: PromptImageInput | None = None, + videos: PromptVideoInput | None = None, + audios: PromptAudioInput | None = None, ) -> list[dict[str, Any]]: if any( x is not None and len(x) != len(prompts) for x in [images, videos, audios] @@ -824,11 +822,11 @@ class VllmRunner: def generate( self, - prompts: Union[list[str], list[torch.Tensor], list[list[int]]], + prompts: list[str] | list[torch.Tensor] | list[list[int]], sampling_params: SamplingParams, - images: Optional[PromptImageInput] = None, - videos: Optional[PromptVideoInput] = None, - audios: Optional[PromptAudioInput] = None, + images: PromptImageInput | None = None, + videos: PromptVideoInput | None = None, + audios: PromptAudioInput | None = None, **kwargs: Any, ) -> list[tuple[list[list[int]], list[str]]]: inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios) @@ -871,11 +869,11 @@ class VllmRunner: self, prompts: list[str], sampling_params: SamplingParams, - images: Optional[PromptImageInput] = None, - audios: Optional[PromptAudioInput] = None, - videos: Optional[PromptVideoInput] = None, + images: PromptImageInput | None = None, + audios: PromptAudioInput | None = None, + videos: PromptVideoInput | None = None, **kwargs: Any, - ) -> Union[list[TokensTextLogprobs], list[TokensTextLogprobsPromptLogprobs]]: + ) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]: inputs = self.get_inputs(prompts, images=images, 
videos=videos, audios=audios) req_outputs = self.llm.generate( @@ -894,11 +892,11 @@ class VllmRunner: def generate_greedy( self, - prompts: Union[list[str], list[torch.Tensor], list[list[int]]], + prompts: list[str] | list[torch.Tensor] | list[list[int]], max_tokens: int, - images: Optional[PromptImageInput] = None, - videos: Optional[PromptVideoInput] = None, - audios: Optional[PromptAudioInput] = None, + images: PromptImageInput | None = None, + videos: PromptVideoInput | None = None, + audios: PromptAudioInput | None = None, **kwargs: Any, ) -> list[tuple[list[int], str]]: greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) @@ -916,15 +914,15 @@ class VllmRunner: self, prompts: list[str], max_tokens: int, - num_logprobs: Optional[int], - num_prompt_logprobs: Optional[int] = None, - images: Optional[PromptImageInput] = None, - audios: Optional[PromptAudioInput] = None, - videos: Optional[PromptVideoInput] = None, - stop_token_ids: Optional[list[int]] = None, - stop: Optional[list[str]] = None, + num_logprobs: int | None, + num_prompt_logprobs: int | None = None, + images: PromptImageInput | None = None, + audios: PromptAudioInput | None = None, + videos: PromptVideoInput | None = None, + stop_token_ids: list[int] | None = None, + stop: list[str] | None = None, **kwargs: Any, - ) -> Union[list[TokensTextLogprobs], list[TokensTextLogprobsPromptLogprobs]]: + ) -> list[TokensTextLogprobs] | list[TokensTextLogprobsPromptLogprobs]: greedy_logprobs_params = SamplingParams( temperature=0.0, max_tokens=max_tokens, @@ -957,7 +955,7 @@ class VllmRunner: perplexities = [] for output in outputs: output = cast(TokensTextLogprobsPromptLogprobs, output) - token_datas = cast(list[Optional[dict[int, Logprob]]], output[3]) + token_datas = cast(list[dict[int, Logprob] | None], output[3]) assert token_datas[0] is None token_log_probs = [] for token_data in token_datas[1:]: @@ -976,10 +974,10 @@ class VllmRunner: prompts: list[str], beam_width: int, max_tokens: 
int, - images: Optional[PromptImageInput] = None, - videos: Optional[PromptVideoInput] = None, - audios: Optional[PromptAudioInput] = None, - concurrency_limit: Optional[int] = None, + images: PromptImageInput | None = None, + videos: PromptVideoInput | None = None, + audios: PromptAudioInput | None = None, + concurrency_limit: int | None = None, ) -> list[tuple[list[list[int]], list[str]]]: inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios) @@ -1002,9 +1000,9 @@ class VllmRunner: def embed( self, prompts: list[str], - images: Optional[PromptImageInput] = None, - videos: Optional[PromptVideoInput] = None, - audios: Optional[PromptAudioInput] = None, + images: PromptImageInput | None = None, + videos: PromptVideoInput | None = None, + audios: PromptAudioInput | None = None, *args, **kwargs, ) -> list[list[float]]: @@ -1023,8 +1021,8 @@ class VllmRunner: def score( self, - text_1: Union[str, list[str]], - text_2: Union[str, list[str]], + text_1: list[str] | str, + text_2: list[str] | str, *args, **kwargs, ) -> list[float]: @@ -1226,8 +1224,8 @@ def _find_free_port() -> int: class LocalAssetServer: address: str port: int - server: Optional[http.server.ThreadingHTTPServer] - thread: Optional[threading.Thread] + server: http.server.ThreadingHTTPServer | None + thread: threading.Thread | None def __init__(self, address: str = "127.0.0.1") -> None: self.address = address diff --git a/tests/detokenizer/test_stop_strings.py b/tests/detokenizer/test_stop_strings.py index d59b394393e34..6b829c2610359 100644 --- a/tests/detokenizer/test_stop_strings.py +++ b/tests/detokenizer/test_stop_strings.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import Any import pytest @@ -15,8 +15,8 @@ def _test_stopping( llm: LLM, expected_output: str, expected_reason: Any, - stop: Optional[list[str]] = None, - stop_token_ids: 
Optional[list[int]] = None, + stop: list[str] | None = None, + stop_token_ids: list[int] | None = None, include_in_output: bool = False, ) -> None: output = llm.generate( diff --git a/tests/distributed/conftest.py b/tests/distributed/conftest.py index 47ceb45057c97..9c146a3323d90 100644 --- a/tests/distributed/conftest.py +++ b/tests/distributed/conftest.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random -from typing import Optional, Union import msgspec import msgspec.msgpack @@ -78,8 +77,8 @@ class MockSubscriber: def __init__( self, - pub_endpoints: Union[str, list[str]], - replay_endpoints: Optional[Union[str, list[str]]] = None, + pub_endpoints: str | list[str], + replay_endpoints: str | list[str] | None = None, topic: str = "", decode_type=SampleBatch, ): @@ -111,7 +110,7 @@ class MockSubscriber: self.last_seq = -1 self.decoder = msgspec.msgpack.Decoder(type=decode_type) - def receive_one(self, timeout=1000) -> Union[tuple[int, SampleBatch], None]: + def receive_one(self, timeout=1000) -> tuple[int, SampleBatch] | None: """Receive a single message with timeout""" if not self.sub.poll(timeout): return None diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index c61c4584d8376..ba80ee6fb83ba 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -5,9 +5,8 @@ Run `pytest tests/distributed/test_comm_ops.py`. 
""" -from __future__ import annotations - -from typing import Any, Callable +from collections.abc import Callable +from typing import Any import pytest import ray diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py index 89c2c9f8badeb..149b502a85a75 100644 --- a/tests/distributed/test_context_parallel.py +++ b/tests/distributed/test_context_parallel.py @@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node import json import os from dataclasses import dataclass -from typing import Literal, NamedTuple, Optional +from typing import Literal, NamedTuple import pytest @@ -36,7 +36,7 @@ class ParallelSetup(NamedTuple): class CPTestOptions(NamedTuple): multi_node_only: bool - load_format: Optional[str] = None + load_format: str | None = None @dataclass @@ -54,7 +54,7 @@ class CPTestSettings: dcp_base: int = 1, multi_node_only: bool = False, runner: RunnerOption = "auto", - load_format: Optional[str] = None, + load_format: str | None = None, ): parallel_setups = [] for eager_mode_val in [False]: diff --git a/tests/distributed/test_expert_parallel.py b/tests/distributed/test_expert_parallel.py index 8a9ddcd58cfce..0228d42a76a0f 100644 --- a/tests/distributed/test_expert_parallel.py +++ b/tests/distributed/test_expert_parallel.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import Literal, NamedTuple, Optional +from typing import Literal, NamedTuple import pytest @@ -22,9 +22,9 @@ class ParallelSetup(NamedTuple): class EPTestOptions(NamedTuple): trust_remote_code: bool - tokenizer_mode: Optional[str] - load_format: Optional[str] = None - hf_overrides: Optional[str] = None + tokenizer_mode: str | None + load_format: str | None = None + hf_overrides: str | None = None @dataclass @@ -40,9 +40,9 @@ class EPTestSettings: tp_base: int = 2, runner: RunnerOption = "auto", trust_remote_code: bool = False, - 
tokenizer_mode: Optional[str] = None, - load_format: Optional[str] = None, - hf_overrides: Optional[str] = None, + tokenizer_mode: str | None = None, + load_format: str | None = None, + hf_overrides: str | None = None, ): return EPTestSettings( parallel_setups=[ @@ -72,9 +72,9 @@ class EPTestSettings: tp_base: int = 2, runner: RunnerOption = "auto", trust_remote_code: bool = False, - tokenizer_mode: Optional[str] = None, - load_format: Optional[str] = None, - hf_overrides: Optional[str] = None, + tokenizer_mode: str | None = None, + load_format: str | None = None, + hf_overrides: str | None = None, ): return EPTestSettings( parallel_setups=[ diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 43f0c9dd1a85a..24f62cff299a0 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node import json import os from dataclasses import dataclass -from typing import Literal, NamedTuple, Optional +from typing import Literal, NamedTuple import pytest @@ -35,7 +35,7 @@ class ParallelSetup(NamedTuple): class PPTestOptions(NamedTuple): multi_node_only: bool - load_format: Optional[str] = None + load_format: str | None = None @dataclass @@ -52,7 +52,7 @@ class PPTestSettings: pp_base: int = 2, multi_node_only: bool = False, runner: RunnerOption = "auto", - load_format: Optional[str] = None, + load_format: str | None = None, ): return PPTestSettings( parallel_setups=[ @@ -76,7 +76,7 @@ class PPTestSettings: pp_base: int = 2, runner: RunnerOption = "auto", multi_node_only: bool = False, - load_format: Optional[str] = None, + load_format: str | None = None, ): return PPTestSettings( parallel_setups=[ diff --git a/tests/distributed/test_pp_cudagraph.py b/tests/distributed/test_pp_cudagraph.py index 2c9f474640088..2f2b43cb4cc2b 100644 --- a/tests/distributed/test_pp_cudagraph.py +++ 
b/tests/distributed/test_pp_cudagraph.py @@ -1,16 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from __future__ import annotations - -from typing import TYPE_CHECKING - import pytest +from typing_extensions import LiteralString from ..utils import compare_two_settings, create_new_process_for_each_test -if TYPE_CHECKING: - from typing_extensions import LiteralString - @pytest.mark.parametrize( "PP_SIZE, MODEL_NAME", diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index 0847687cf2f9a..a431bf30fc890 100644 --- a/tests/distributed/test_sequence_parallel.py +++ b/tests/distributed/test_sequence_parallel.py @@ -11,7 +11,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node import json import os from dataclasses import dataclass -from typing import Literal, NamedTuple, Optional +from typing import Literal, NamedTuple import pytest @@ -36,7 +36,7 @@ class ParallelSetup(NamedTuple): class SPTestOptions(NamedTuple): multi_node_only: bool - load_format: Optional[str] = None + load_format: str | None = None @dataclass @@ -53,7 +53,7 @@ class SPTestSettings: pp_base: int = 1, multi_node_only: bool = False, runner: RunnerOption = "auto", - load_format: Optional[str] = None, + load_format: str | None = None, ): parallel_setups = [] for eager_mode_val in [False, True]: @@ -84,7 +84,7 @@ class SPTestSettings: pp_base: int = 1, runner: RunnerOption = "auto", multi_node_only: bool = False, - load_format: Optional[str] = None, + load_format: str | None = None, ): parallel_setups = [] for eager_mode_val in [False, True]: @@ -115,7 +115,7 @@ class SPTestSettings: pp_base: int = 1, runner: RunnerOption = "auto", multi_node_only: bool = False, - load_format: Optional[str] = None, + load_format: str | None = None, ): parallel_setups = [] for fusion_val in [False, True]: diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py 
index 9d367349fc2e5..78928a53942f9 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -5,7 +5,7 @@ import json from argparse import ArgumentError from contextlib import nullcontext from dataclasses import dataclass, field -from typing import Annotated, Literal, Optional, Union +from typing import Annotated, Literal import pytest @@ -115,9 +115,9 @@ class NestedConfig: class DummyConfig: regular_bool: bool = True """Regular bool with default True""" - optional_bool: Optional[bool] = None + optional_bool: bool | None = None """Optional bool with default None""" - optional_literal: Optional[Literal["x", "y"]] = None + optional_literal: Literal["x", "y"] | None = None """Optional literal with default None""" tuple_n: tuple[int, ...] = field(default_factory=lambda: (1, 2, 3)) """Tuple with variable length""" @@ -127,7 +127,7 @@ class DummyConfig: """List with variable length""" list_literal: list[Literal[1, 2]] = field(default_factory=list) """List with literal choices""" - list_union: list[Union[str, type[object]]] = field(default_factory=list) + list_union: list[str | type[object]] = field(default_factory=list) """List with union type""" literal_literal: Literal[Literal[1], Literal[2]] = 1 """Literal of literals with default 1""" @@ -152,11 +152,11 @@ def test_is_not_builtin(type_hint, expected): ("type_hint", "expected"), [ (Annotated[int, "annotation"], {int}), - (Optional[int], {int, type(None)}), - (Annotated[Optional[int], "annotation"], {int, type(None)}), - (Optional[Annotated[int, "annotation"]], {int, type(None)}), + (int | None, {int, type(None)}), + (Annotated[int | None, "annotation"], {int, type(None)}), + (Annotated[int, "annotation"] | None, {int, type(None)}), ], - ids=["Annotated", "Optional", "Annotated_Optional", "Optional_Annotated"], + ids=["Annotated", "or_None", "Annotated_or_None", "or_None_Annotated"], ) def test_get_type_hints(type_hint, expected): assert get_type_hints(type_hint) == expected diff --git 
a/tests/entrypoints/openai/test_async_tokenization.py b/tests/entrypoints/openai/test_async_tokenization.py index 5df859df42da7..682420a83a442 100644 --- a/tests/entrypoints/openai/test_async_tokenization.py +++ b/tests/entrypoints/openai/test_async_tokenization.py @@ -3,7 +3,7 @@ import asyncio import random -from typing import Callable +from collections.abc import Callable import openai import pytest diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 14181c6b8b16b..fa8ae55d14a23 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -3,7 +3,6 @@ # imports for structured outputs tests import json -from typing import Optional import jsonschema import openai # use the official client for correctness check @@ -176,7 +175,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, model_name: st [(MODEL_NAME, 1), (MODEL_NAME, 0), (MODEL_NAME, -1), (MODEL_NAME, None)], ) async def test_prompt_logprobs_chat( - client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: Optional[int] + client: openai.AsyncOpenAI, model_name: str, prompt_logprobs: int | None ): params: dict = { "messages": [ diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index e64f68cad7c83..44d4176655375 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import datetime -from typing import Union import openai # use the official client for correctness check import pytest @@ -166,7 +165,7 @@ async def test_function_tool_use( client: openai.AsyncOpenAI, model_name: str, stream: bool, - tool_choice: Union[str, dict], + tool_choice: str | dict, enable_thinking: bool, ): if not stream: diff --git 
a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index aa4ee603647e4..a85418d5b5f4e 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -4,7 +4,6 @@ from contextlib import suppress from dataclasses import dataclass, field from http import HTTPStatus -from typing import Optional from unittest.mock import AsyncMock, MagicMock import pytest @@ -38,13 +37,13 @@ class MockModelConfig: trust_remote_code: bool = False tokenizer_mode: str = "auto" max_model_len: int = 100 - tokenizer_revision: Optional[str] = None + tokenizer_revision: str | None = None multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig) hf_config: MockHFConfig = field(default_factory=MockHFConfig) - logits_processor_pattern: Optional[str] = None - diff_sampling_param: Optional[dict] = None + logits_processor_pattern: str | None = None + diff_sampling_param: dict | None = None allowed_local_media_path: str = "" - allowed_media_domains: Optional[list[str]] = None + allowed_media_domains: list[str] | None = None encoder_config = None generation_config: str = "auto" skip_tokenizer_init: bool = False @@ -56,7 +55,7 @@ class MockModelConfig: class MockLoRAResolver(LoRAResolver): async def resolve_lora( self, base_model_name: str, lora_name: str - ) -> Optional[LoRARequest]: + ) -> LoRARequest | None: if lora_name == "test-lora": return LoRARequest( lora_name="test-lora", diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 10224dee0efe8..d1367b4eeaf62 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -1,16 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from __future__ import annotations - import asyncio from contextlib import suppress from dataclasses import dataclass, field 
-from typing import TYPE_CHECKING, Any +from typing import Any from unittest.mock import AsyncMock, MagicMock import pytest import pytest_asyncio +from openai import OpenAI from vllm.config.multimodal import MultiModalConfig from vllm.entrypoints.openai.protocol import ChatCompletionRequest @@ -21,9 +19,6 @@ from vllm.v1.engine.async_llm import AsyncLLM from ...utils import RemoteOpenAIServer -if TYPE_CHECKING: - from openai import OpenAI - GPT_OSS_MODEL_NAME = "openai/gpt-oss-20b" diff --git a/tests/entrypoints/openai/tool_parsers/utils.py b/tests/entrypoints/openai/tool_parsers/utils.py index cfa4d3584e709..7489a406224a5 100644 --- a/tests/entrypoints/openai/tool_parsers/utils.py +++ b/tests/entrypoints/openai/tool_parsers/utils.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable -from typing import Union from vllm.entrypoints.openai.protocol import ( ChatCompletionRequest, @@ -84,10 +83,10 @@ class StreamingToolReconstructor: def run_tool_extraction( tool_parser: ToolParser, model_output: str, - request: Union[ChatCompletionRequest, None] = None, + request: ChatCompletionRequest | None = None, streaming: bool = False, assert_one_tool_per_delta: bool = True, -) -> tuple[Union[str, None], list[ToolCall]]: +) -> tuple[str | None, list[ToolCall]]: if streaming: reconstructor = run_tool_extraction_streaming( tool_parser, @@ -105,7 +104,7 @@ def run_tool_extraction( def run_tool_extraction_nonstreaming( tool_parser: ToolParser, model_output: str, - request: Union[ChatCompletionRequest, None] = None, + request: ChatCompletionRequest | None = None, ) -> ExtractedToolCallInformation: request = request or ChatCompletionRequest(messages=[], model="test-model") return tool_parser.extract_tool_calls(model_output, request) @@ -114,7 +113,7 @@ def run_tool_extraction_nonstreaming( def run_tool_extraction_streaming( tool_parser: ToolParser, model_deltas: Iterable[str], - request: 
Union[ChatCompletionRequest, None] = None, + request: ChatCompletionRequest | None = None, assert_one_tool_per_delta: bool = True, ) -> StreamingToolReconstructor: request = request or ChatCompletionRequest(messages=[], model="test-model") diff --git a/tests/entrypoints/pooling/openai/test_embedding_dimensions.py b/tests/entrypoints/pooling/openai/test_embedding_dimensions.py index 92df43d7dbdcf..ba9fb64262772 100644 --- a/tests/entrypoints/pooling/openai/test_embedding_dimensions.py +++ b/tests/entrypoints/pooling/openai/test_embedding_dimensions.py @@ -4,8 +4,6 @@ Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`. """ -from typing import Optional - import openai import pytest @@ -103,14 +101,14 @@ async def test_matryoshka( run_embedding_correctness_test(hf_model, prompts, vllm_outputs, dimensions) if model_info.is_matryoshka: - valid_dimensions: list[Optional[int]] = [None] + valid_dimensions: list[int | None] = [None] if model_info.matryoshka_dimensions is not None: valid_dimensions += model_info.matryoshka_dimensions[:2] for dimensions in valid_dimensions: await make_request_and_correctness_test(dimensions) - invalid_dimensions: list[Optional[int]] = [-1] + invalid_dimensions: list[int | None] = [-1] if model_info.matryoshka_dimensions is not None: assert 5 not in model_info.matryoshka_dimensions invalid_dimensions.append(5) diff --git a/tests/entrypoints/test_api_server_process_manager.py b/tests/entrypoints/test_api_server_process_manager.py index e548f52e1e94d..3fadbf2ef0dd0 100644 --- a/tests/entrypoints/test_api_server_process_manager.py +++ b/tests/entrypoints/test_api_server_process_manager.py @@ -5,7 +5,6 @@ import multiprocessing import socket import threading import time -from typing import Optional from unittest.mock import patch import pytest @@ -105,7 +104,7 @@ def test_wait_for_completion_or_failure(api_server_args): assert len(manager.processes) == 3 # Create a result capture for the thread - result: dict[str, 
Optional[Exception]] = {"exception": None} + result: dict[str, Exception | None] = {"exception": None} def run_with_exception_capture(): try: @@ -218,7 +217,7 @@ def test_external_process_monitoring(api_server_args): assert len(manager.processes) == 3 # Create a result capture for the thread - result: dict[str, Optional[Exception]] = {"exception": None} + result: dict[str, Exception | None] = {"exception": None} def run_with_exception_capture(): try: diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index dcd196ebdd772..224b68412e60a 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -3,7 +3,7 @@ import warnings from collections.abc import Mapping -from typing import Literal, Optional +from typing import Literal import pytest from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy @@ -152,9 +152,9 @@ def audio_url(): def _assert_mm_data_is_image_input( - mm_data: Optional[MultiModalDataDict], + mm_data: MultiModalDataDict | None, image_count: int, - skipped_image_indices: Optional[list] = None, + skipped_image_indices: list | None = None, ) -> None: assert mm_data is not None assert set(mm_data.keys()) == {"image"} @@ -169,9 +169,9 @@ def _assert_mm_data_is_image_input( def _assert_mm_uuids( - mm_uuids: Optional[MultiModalUUIDDict], + mm_uuids: MultiModalUUIDDict | None, media_count: int, - expected_uuids: list[Optional[str]], + expected_uuids: list[str | None], modality: str = "image", ) -> None: if len(expected_uuids) > 0: @@ -193,9 +193,9 @@ MultiModalDataCounts = Mapping[ModalityType, int] def _assert_mm_data_inputs( - mm_data: Optional[MultiModalDataDict], + mm_data: MultiModalDataDict | None, data_count: MultiModalDataCounts, - skipped_media_indices: Optional[dict[str, list]] = None, # modality -> list[int] + skipped_media_indices: dict[str, list] | None = None, # modality -> list[int] ) -> None: assert mm_data is not None assert set(data_count.keys()) == 
(set(mm_data.keys())) diff --git a/tests/entrypoints/test_renderer.py b/tests/entrypoints/test_renderer.py index f93978c3e6e72..c811a6ba63cb5 100644 --- a/tests/entrypoints/test_renderer.py +++ b/tests/entrypoints/test_renderer.py @@ -3,7 +3,6 @@ import io from dataclasses import dataclass -from typing import Optional from unittest.mock import AsyncMock, MagicMock import pybase64 @@ -17,7 +16,7 @@ from vllm.inputs.data import is_embeds_prompt @dataclass class MockModelConfig: max_model_len: int = 100 - encoder_config: Optional[dict] = None + encoder_config: dict | None = None class MockTokenizerResult: diff --git a/tests/evals/gsm8k/gsm8k_eval.py b/tests/evals/gsm8k/gsm8k_eval.py index 9edec7a78ca23..c7799607912b6 100644 --- a/tests/evals/gsm8k/gsm8k_eval.py +++ b/tests/evals/gsm8k/gsm8k_eval.py @@ -12,7 +12,6 @@ import json import os import time from collections.abc import Generator -from typing import Optional, Union import aiohttp import numpy as np @@ -23,7 +22,7 @@ from tqdm.asyncio import tqdm INVALID = -9999999 -def download_and_cache_file(url: str, filename: Optional[str] = None) -> str: +def download_and_cache_file(url: str, filename: str | None = None) -> str: """Download and cache a file from a URL.""" if filename is None: filename = os.path.join("/tmp", url.split("/")[-1]) @@ -81,9 +80,9 @@ async def call_vllm_api( prompt: str, temperature: float, max_tokens: int, - stop: Optional[list[str]] = None, - url: Optional[str] = None, - seed: Optional[int] = None, + stop: list[str] | None = None, + url: str | None = None, + seed: int | None = None, ) -> str: """Call vLLM's OpenAI-compatible completions endpoint.""" data = { @@ -112,8 +111,8 @@ def evaluate_gsm8k( host: str = "http://127.0.0.1", port: int = 8000, temperature: float = 0.0, - seed: Optional[int] = 42, -) -> dict[str, Union[float, int]]: + seed: int | None = 42, +) -> dict[str, float | int]: """ Evaluate GSM8K accuracy using vLLM serve endpoint. 
diff --git a/tests/kernels/attention/test_aiter_flash_attn.py b/tests/kernels/attention/test_aiter_flash_attn.py index 88b21a9b84d64..1dec46e33f22e 100644 --- a/tests/kernels/attention/test_aiter_flash_attn.py +++ b/tests/kernels/attention/test_aiter_flash_attn.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest import torch @@ -27,8 +26,8 @@ def ref_paged_attn( kv_lens: list[int], block_tables: torch.Tensor, scale: float, - sliding_window: Optional[int] = None, - soft_cap: Optional[float] = None, + sliding_window: int | None = None, + soft_cap: float | None = None, ) -> torch.Tensor: num_seqs = len(query_lens) block_tables = block_tables.cpu().numpy() @@ -94,12 +93,12 @@ def test_varlen_with_paged_kv( seq_lens: list[tuple[int, int]], num_heads: tuple[int, int], head_size: int, - sliding_window: Optional[int], + sliding_window: int | None, dtype: torch.dtype, block_size: int, - soft_cap: Optional[float], + soft_cap: float | None, num_blocks: int, - q_dtype: Optional[torch.dtype], + q_dtype: torch.dtype | None, ) -> None: torch.set_default_device("cuda") current_platform.seed_everything(0) diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py index 16e544eb3cf9f..15cdb950a7db5 100644 --- a/tests/kernels/attention/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random -from typing import Optional import pytest import torch @@ -50,7 +49,7 @@ def ref_masked_attention( key: torch.Tensor, value: torch.Tensor, scale: float, - attn_mask: Optional[torch.Tensor] = None, + attn_mask: torch.Tensor | None = None, ) -> torch.Tensor: attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float() if attn_mask is not None: @@ -69,7 +68,7 @@ def ref_single_query_cached_kv_attention( block_tables: 
torch.Tensor, seq_lens: torch.Tensor, scale: float, - alibi_slopes: Optional[torch.Tensor], + alibi_slopes: torch.Tensor | None, ) -> None: num_query_heads = query.shape[1] num_kv_heads = value_cache.shape[1] @@ -415,7 +414,7 @@ def ref_multi_query_kv_attention( key: torch.Tensor, value: torch.Tensor, scale: float, - alibi_bias: Optional[list[torch.Tensor]], + alibi_bias: list[torch.Tensor] | None, dtype: torch.dtype, ) -> torch.Tensor: num_seqs = len(cu_seq_lens) - 1 diff --git a/tests/kernels/attention/test_cascade_flash_attn.py b/tests/kernels/attention/test_cascade_flash_attn.py index 58e8bd592ba43..4295f852f95bb 100755 --- a/tests/kernels/attention/test_cascade_flash_attn.py +++ b/tests/kernels/attention/test_cascade_flash_attn.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest import torch @@ -85,7 +84,7 @@ def test_cascade( head_size: int, dtype: torch.dtype, block_size: int, - soft_cap: Optional[float], + soft_cap: float | None, num_blocks: int, fa_version: int, ) -> None: diff --git a/tests/kernels/attention/test_cutlass_mla_decode.py b/tests/kernels/attention/test_cutlass_mla_decode.py index dad1510ce532b..a60f4e385a893 100644 --- a/tests/kernels/attention/test_cutlass_mla_decode.py +++ b/tests/kernels/attention/test_cutlass_mla_decode.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math import random -from typing import Optional import pytest import torch @@ -17,7 +16,7 @@ def cal_diff( y: torch.Tensor, name: str, use_fp8: bool = False, - diff_threshold: Optional[float] = None, + diff_threshold: float | None = None, ) -> None: x, y = x.double(), y.double() cos_diff = 1 - 2 * (x * y).sum().item() / max((x * x + y * y).sum().item(), 1e-12) diff --git a/tests/kernels/attention/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py index d39f0a593ed41..18995545552ea 100644 --- 
a/tests/kernels/attention/test_flash_attn.py +++ b/tests/kernels/attention/test_flash_attn.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest import torch @@ -34,8 +33,8 @@ def ref_paged_attn( kv_lens: list[int], block_tables: torch.Tensor, scale: float, - sliding_window: Optional[int] = None, - soft_cap: Optional[float] = None, + sliding_window: int | None = None, + soft_cap: float | None = None, ) -> torch.Tensor: num_seqs = len(query_lens) block_tables = block_tables.cpu().numpy() @@ -103,11 +102,11 @@ def test_flash_attn_with_paged_kv( head_size: int, dtype: torch.dtype, block_size: int, - soft_cap: Optional[float], + soft_cap: float | None, num_blocks: int, - sliding_window: Optional[int], + sliding_window: int | None, fa_version: int, - q_dtype: Optional[torch.dtype], + q_dtype: torch.dtype | None, ) -> None: torch.set_default_device("cuda") if not is_fa_version_supported(fa_version): @@ -221,13 +220,13 @@ def test_varlen_with_paged_kv( seq_lens: list[tuple[int, int]], num_heads: tuple[int, int], head_size: int, - sliding_window: Optional[int], + sliding_window: int | None, dtype: torch.dtype, block_size: int, - soft_cap: Optional[float], + soft_cap: float | None, num_blocks: int, fa_version: int, - q_dtype: Optional[torch.dtype], + q_dtype: torch.dtype | None, ) -> None: torch.set_default_device("cuda") if not is_fa_version_supported(fa_version): diff --git a/tests/kernels/attention/test_flashinfer.py b/tests/kernels/attention/test_flashinfer.py index 52cd10fdc5be0..82ec2ef14e56c 100644 --- a/tests/kernels/attention/test_flashinfer.py +++ b/tests/kernels/attention/test_flashinfer.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import flashinfer import pytest @@ -26,8 +25,8 @@ def ref_paged_attn( kv_lens: list[int], block_tables: 
torch.Tensor, scale: float, - sliding_window: Optional[int] = None, - soft_cap: Optional[float] = None, + sliding_window: int | None = None, + soft_cap: float | None = None, ) -> torch.Tensor: num_seqs = len(query_lens) block_tables = block_tables.cpu().numpy() @@ -90,8 +89,8 @@ def test_flashinfer_decode_with_paged_kv( head_size: int, dtype: torch.dtype, block_size: int, - soft_cap: Optional[float], - sliding_window: Optional[int], + soft_cap: float | None, + sliding_window: int | None, ) -> None: torch.set_default_device("cuda") current_platform.seed_everything(0) @@ -185,8 +184,8 @@ def test_flashinfer_prefill_with_paged_kv( head_size: int, dtype: torch.dtype, block_size: int, - soft_cap: Optional[float], - sliding_window: Optional[int], + soft_cap: float | None, + sliding_window: int | None, ) -> None: torch.set_default_device("cuda") current_platform.seed_everything(0) @@ -288,7 +287,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv( head_size: int, dtype: torch.dtype, block_size: int, - soft_cap: Optional[float], + soft_cap: float | None, ) -> None: pytest.skip("TODO: fix the accuracy issue") torch.set_default_device("cuda") @@ -398,7 +397,7 @@ def test_flashinfer_decode_with_paged_fp8_kv( head_size: int, dtype: torch.dtype, block_size: int, - soft_cap: Optional[float], + soft_cap: float | None, ) -> None: # test doesn't work for num_heads = (16,16) torch.set_default_device("cuda") diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py index 61157429ec9cc..00f06da5a47b4 100644 --- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py +++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import flashinfer import pytest @@ -68,9 +67,7 @@ NUM_BLOCKS = 32768 # Large enough to test overflow in index calculation. 
@torch.inference_mode def test_flashinfer_trtllm_decode_with_baseline( dtype: torch.dtype, - quant_dtypes: tuple[ - Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype] - ], + quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None], batch_size: int, max_seq_lens: tuple[int, int], num_heads: tuple[int, int], @@ -78,7 +75,7 @@ def test_flashinfer_trtllm_decode_with_baseline( kv_layout: str, block_size: int, window_left: int, - soft_cap: Optional[float], + soft_cap: float | None, has_sinks: bool, ) -> None: torch.set_default_device("cuda") @@ -267,9 +264,7 @@ def test_flashinfer_trtllm_decode_with_baseline( @torch.inference_mode def test_flashinfer_trtllm_prefill_with_baseline( dtype: torch.dtype, - quant_dtypes: tuple[ - Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype] - ], + quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None], batch_size: int, max_seq_lens: tuple[int, int], num_heads: tuple[int, int], @@ -277,7 +272,7 @@ def test_flashinfer_trtllm_prefill_with_baseline( kv_layout: str, block_size: int, window_left: int, - soft_cap: Optional[float], + soft_cap: float | None, has_sinks: bool, ) -> None: torch.set_default_device("cuda") diff --git a/tests/kernels/attention/test_merge_attn_states.py b/tests/kernels/attention/test_merge_attn_states.py index eb9204dfaf158..9b084f2f660b2 100644 --- a/tests/kernels/attention/test_merge_attn_states.py +++ b/tests/kernels/attention/test_merge_attn_states.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest import torch @@ -20,7 +19,7 @@ def merge_attn_states_torch( prefix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS] suffix_output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] suffix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS] - output_lse: Optional[torch.Tensor] = None, # [NUM_HEADS, NUM_TOKENS] + output_lse: torch.Tensor 
| None = None, # [NUM_HEADS, NUM_TOKENS] ): p_lse = prefix_lse s_lse = suffix_lse diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py index fba82cfdadbdf..bf4d2179af5f9 100644 --- a/tests/kernels/attention/test_triton_unified_attention.py +++ b/tests/kernels/attention/test_triton_unified_attention.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest import torch @@ -32,8 +31,8 @@ def ref_paged_attn( kv_lens: list[int], block_tables: torch.Tensor, scale: float, - sliding_window: Optional[int] = None, - soft_cap: Optional[float] = None, + sliding_window: int | None = None, + soft_cap: float | None = None, ) -> torch.Tensor: num_seqs = len(query_lens) block_tables = block_tables.cpu().numpy() @@ -98,12 +97,12 @@ def test_triton_unified_attn( seq_lens: list[tuple[int, int]], num_heads: tuple[int, int], head_size: int, - sliding_window: Optional[int], + sliding_window: int | None, dtype: torch.dtype, block_size: int, - soft_cap: Optional[float], + soft_cap: float | None, num_blocks: int, - q_dtype: Optional[torch.dtype], + q_dtype: torch.dtype | None, ) -> None: torch.set_default_device("cuda") diff --git a/tests/kernels/core/test_fused_quant_layernorm.py b/tests/kernels/core/test_fused_quant_layernorm.py index 52133ec53d1d7..418c700bbf003 100644 --- a/tests/kernels/core/test_fused_quant_layernorm.py +++ b/tests/kernels/core/test_fused_quant_layernorm.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional, Union import pytest import torch @@ -31,13 +30,13 @@ EPS = 1e-6 ## Helpers -def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor: +def as_float32_tensor(x: float | torch.Tensor) -> torch.Tensor: return torch.as_tensor(x, dtype=torch.float32, device="cuda") def 
ref_rms_norm( - rms_norm_layer: RMSNorm, x: torch.Tensor, residual: Optional[torch.Tensor] -) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + rms_norm_layer: RMSNorm, x: torch.Tensor, residual: torch.Tensor | None +) -> tuple[torch.Tensor, torch.Tensor | None]: if residual is not None: residual = residual.clone() out, residual = rms_norm_layer.forward_native(x, residual) @@ -51,9 +50,9 @@ def ref_dynamic_per_token_quant( rms_norm_layer: RMSNorm, x: torch.Tensor, quant_dtype: torch.dtype, - residual: Optional[torch.Tensor], - scale_ub: Optional[torch.Tensor], -) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + residual: torch.Tensor | None, + scale_ub: torch.Tensor | None, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]: if scale_ub is not None: assert quant_dtype == torch.float8_e4m3fn @@ -76,9 +75,9 @@ def ref_impl( rms_norm_layer: RMSNorm, x: torch.Tensor, quant_dtype: torch.dtype, - residual: Optional[torch.Tensor], - scale_ub: Optional[torch.Tensor], -) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + residual: torch.Tensor | None, + scale_ub: torch.Tensor | None, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]: return ref_dynamic_per_token_quant( rms_norm_layer, x, quant_dtype, residual, scale_ub ) @@ -88,9 +87,9 @@ def ops_dynamic_per_token_quant( weight: torch.Tensor, x: torch.Tensor, quant_dtype: torch.dtype, - residual: Optional[torch.Tensor], - scale_ub: Optional[torch.Tensor], -) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + residual: torch.Tensor | None, + scale_ub: torch.Tensor | None, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]: if residual is not None: residual = residual.clone() out, scales = ops.rms_norm_dynamic_per_token_quant( @@ -103,9 +102,9 @@ def ops_impl( weight: torch.Tensor, x: torch.Tensor, quant_dtype: torch.dtype, - residual: Optional[torch.Tensor], - scale_ub: Optional[torch.Tensor], -) -> tuple[torch.Tensor, torch.Tensor, 
Optional[torch.Tensor]]: + residual: torch.Tensor | None, + scale_ub: torch.Tensor | None, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]: return ops_dynamic_per_token_quant(weight, x, quant_dtype, residual, scale_ub) diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index 799e0a3f2a2bd..e1ddc5de067bb 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable from itertools import product -from typing import Callable, Optional import pytest import torch @@ -68,7 +68,7 @@ def test_rotary_embedding( seq_len: int, num_heads: int, head_size: int, - rotary_dim: Optional[int], + rotary_dim: int | None, dtype: torch.dtype, seed: int, device: str, diff --git a/tests/kernels/core/test_rotary_embedding.py b/tests/kernels/core/test_rotary_embedding.py index 0a292a3e2ae70..30c64e0bd72a7 100644 --- a/tests/kernels/core/test_rotary_embedding.py +++ b/tests/kernels/core/test_rotary_embedding.py @@ -4,8 +4,6 @@ Tests for miscellaneous utilities """ -from typing import Optional - import pytest import torch @@ -17,7 +15,7 @@ def rotary_embedding_opcheck( rot, positions: torch.Tensor, query: torch.Tensor, - key: Optional[torch.Tensor] = None, + key: torch.Tensor | None = None, ): cos_sin_cache = rot.cos_sin_cache.to(query.device, dtype=query.dtype) diff --git a/tests/kernels/mamba/test_causal_conv1d.py b/tests/kernels/mamba/test_causal_conv1d.py index fea6b94481b60..d9023490d7fc2 100644 --- a/tests/kernels/mamba/test_causal_conv1d.py +++ b/tests/kernels/mamba/test_causal_conv1d.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest import torch @@ -19,11 +18,11 @@ from vllm.platforms import current_platform def 
causal_conv1d_ref( x: torch.Tensor, weight: torch.Tensor, - bias: Optional[torch.Tensor] = None, - initial_states: Optional[torch.Tensor] = None, + bias: torch.Tensor | None = None, + initial_states: torch.Tensor | None = None, return_final_states: bool = False, - final_states_out: Optional[torch.Tensor] = None, - activation: Optional[str] = "silu", + final_states_out: torch.Tensor | None = None, + activation: str | None = "silu", ): """ x: (batch, dim, seqlen) @@ -117,12 +116,12 @@ def causal_conv1d_update_ref( def causal_conv1d_opcheck_fn( x: torch.Tensor, weight: torch.Tensor, - bias: Optional[torch.Tensor] = None, - cu_seq_len: Optional[torch.Tensor] = None, - cache_indices: Optional[torch.Tensor] = None, - has_initial_state: Optional[torch.Tensor] = None, - conv_states: Optional[torch.Tensor] = None, - activation: Optional[str] = "silu", + bias: torch.Tensor | None = None, + cu_seq_len: torch.Tensor | None = None, + cache_indices: torch.Tensor | None = None, + has_initial_state: torch.Tensor | None = None, + conv_states: torch.Tensor | None = None, + activation: str | None = "silu", pad_slot_id: int = PAD_SLOT_ID, ): """ diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py index ff12d1fb9a805..94a305a063c3a 100644 --- a/tests/kernels/moe/modular_kernel_tools/common.py +++ b/tests/kernels/moe/modular_kernel_tools/common.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import Any, Optional, Union +from typing import Any import torch @@ -35,7 +35,7 @@ from .mk_objects import ( from .parallel_utils import ProcessGroupInfo -def _describe_tensor(t: Optional[torch.Tensor], name: str) -> str: +def _describe_tensor(t: torch.Tensor | None, name: str) -> str: if t is None: return f"{name} : None" else: @@ -44,21 +44,21 @@ def _describe_tensor(t: Optional[torch.Tensor], name: str) -> str: 
@dataclass class Config: - Ms: Union[list[int], int] + Ms: list[int] | int K: int N: int E: int - topks: Union[list[int], int] + topks: list[int] | int dtype: torch.dtype - quant_config: Optional[TestMoEQuantConfig] + quant_config: TestMoEQuantConfig | None prepare_finalize_type: mk.FusedMoEPrepareAndFinalize fused_experts_type: mk.FusedMoEPermuteExpertsUnpermute - fused_moe_chunk_size: Optional[int] + fused_moe_chunk_size: int | None world_size: int - torch_trace_dir_path: Optional[str] = None + torch_trace_dir_path: str | None = None def __post_init__(self): if self.quant_config is None: @@ -93,7 +93,7 @@ class Config: return self.Ms @property - def quant_dtype(self) -> Union[torch.dtype, str, None]: + def quant_dtype(self) -> torch.dtype | str | None: assert self.quant_config is not None return self.quant_config.quant_dtype @@ -112,7 +112,7 @@ class Config: return self.quant_config.per_out_ch_quant @property - def quant_block_shape(self) -> Optional[list[int]]: + def quant_block_shape(self) -> list[int] | None: assert self.quant_config is not None return self.quant_config.block_shape @@ -209,7 +209,7 @@ class Config: info = prepare_finalize_info(self.prepare_finalize_type) return info.backend - def is_valid(self) -> tuple[bool, Optional[str]]: + def is_valid(self) -> tuple[bool, str | None]: # Check prepare-finalize and fused-experts compatibility if self.is_batched_prepare_finalize(): if not self.is_batched_fused_experts(): @@ -280,10 +280,10 @@ class Config: class WeightTensors: w1: torch.Tensor w2: torch.Tensor - w1_scale: Optional[torch.Tensor] - w2_scale: Optional[torch.Tensor] - w1_gs: Optional[torch.Tensor] = None - w2_gs: Optional[torch.Tensor] = None + w1_scale: torch.Tensor | None + w2_scale: torch.Tensor | None + w1_gs: torch.Tensor | None = None + w2_gs: torch.Tensor | None = None def describe(self): s = "" @@ -351,11 +351,11 @@ class WeightTensors: @dataclass class RankTensors: hidden_states: torch.Tensor - hidden_states_scale: 
Optional[torch.Tensor] + hidden_states_scale: torch.Tensor | None topk_weights: torch.Tensor topk_ids: torch.Tensor - expert_map: Optional[torch.Tensor] + expert_map: torch.Tensor | None def describe(self): s = "" @@ -370,7 +370,7 @@ class RankTensors: @staticmethod def make_hidden_states( config: Config, - ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, torch.Tensor | None]: """ Return hidden_states """ diff --git a/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py b/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py index 7d555202afe6a..95db6327c4f10 100644 --- a/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py +++ b/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py @@ -4,7 +4,6 @@ import copy from enum import Enum from itertools import product -from typing import Optional import torch from tqdm import tqdm @@ -82,7 +81,7 @@ def make_feature_matrix(csv_file_path: str): import pandas as pd def add_to_results( - config: Config, success: Result, results_df: Optional[pd.DataFrame] = None + config: Config, success: Result, results_df: pd.DataFrame | None = None ): config_dict = asdict(config) config_dict["prepare_finalize_type"] = config_dict[ @@ -121,7 +120,7 @@ def make_feature_matrix(csv_file_path: str): product(Ms, Ks, Ns, Es, TOPKs, DTYPEs, PF_TYPES, FE_TYPES, Q_TYPES) ) - results_df: Optional[pd.DataFrame] = None + results_df: pd.DataFrame | None = None for m, k, n, e, topks, dtype, pf_type, experts_type, quant_config in tqdm( combinations ): diff --git a/tests/kernels/moe/modular_kernel_tools/mk_objects.py b/tests/kernels/moe/modular_kernel_tools/mk_objects.py index 174b2d1781ae0..aa41f89cae7dc 100644 --- a/tests/kernels/moe/modular_kernel_tools/mk_objects.py +++ b/tests/kernels/moe/modular_kernel_tools/mk_objects.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing 
import Optional, Union import torch @@ -43,25 +42,25 @@ from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe @dataclass class TestMoEQuantConfig: - quant_dtype: Union[torch.dtype, str, None] + quant_dtype: torch.dtype | str | None per_out_ch_quant: bool per_act_token_quant: bool - block_shape: Optional[list[int]] + block_shape: list[int] | None @dataclass class PrepareFinalizeInfo: activation_format: mk.FusedMoEActivationFormat - supported_dtypes: list[Union[torch.dtype, str]] + supported_dtypes: list[torch.dtype | str] blocked_quantization_support: bool - backend: Optional[str] + backend: str | None supports_apply_weight_on_input: bool = True @dataclass class ExpertInfo: activation_format: mk.FusedMoEActivationFormat - supported_dtypes: list[Union[torch.dtype, str]] + supported_dtypes: list[torch.dtype | str] blocked_quantization_support: bool supports_chunking: bool supports_expert_map: bool @@ -78,7 +77,7 @@ MK_FUSED_EXPERT_TYPES: list[mk.FusedMoEPermuteExpertsUnpermute] = [] standard_format = mk.FusedMoEActivationFormat.Standard batched_format = mk.FusedMoEActivationFormat.BatchedExperts -common_float_types: list[Union[torch.dtype, str]] = [ +common_float_types: list[torch.dtype | str] = [ torch.float8_e4m3fn, torch.bfloat16, torch.float16, @@ -92,9 +91,9 @@ fp8_types = [torch.float8_e4m3fn] def register_prepare_and_finalize( kind, activation_format: mk.FusedMoEActivationFormat, - supported_dtypes: list[Union[torch.dtype, str]], + supported_dtypes: list[torch.dtype | str], blocked_quantization_support: bool, - backend: Optional[str], + backend: str | None, force_multigpu: bool = False, supports_apply_weight_on_input: bool = True, ): @@ -121,7 +120,7 @@ def register_prepare_and_finalize( def register_experts( kind, activation_format: mk.FusedMoEActivationFormat, - supported_dtypes: list[Union[torch.dtype, str]], + supported_dtypes: list[torch.dtype | str], blocked_quantization_support: bool, supports_chunking: bool, supports_expert_map: bool, @@ 
-340,7 +339,7 @@ if cutlass_fp4_supported(): supports_expert_map=False, ) -MK_QUANT_CONFIGS: list[Optional[TestMoEQuantConfig]] = [ +MK_QUANT_CONFIGS: list[TestMoEQuantConfig | None] = [ None, # per-channel / per-column weights and per-tensor activations TestMoEQuantConfig( @@ -395,7 +394,7 @@ if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe(): def make_prepare_finalize( prepare_finalize_type: mk.FusedMoEPrepareAndFinalize, - backend: Optional[str], + backend: str | None, moe: FusedMoEConfig, quant_config: FusedMoEQuantConfig, ) -> mk.FusedMoEPrepareAndFinalize: diff --git a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py index 7802129d3d48f..4aad820635ad7 100644 --- a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py +++ b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py @@ -3,11 +3,12 @@ import dataclasses import os import traceback -from typing import Any, Callable, Optional +from collections.abc import Callable +from typing import Any, Concatenate import torch from torch.multiprocessing import spawn # pyright: ignore[reportPrivateImportUsage] -from typing_extensions import Concatenate, ParamSpec +from typing_extensions import ParamSpec from vllm.config import VllmConfig, set_current_vllm_config from vllm.distributed import init_distributed_environment, initialize_model_parallel @@ -58,9 +59,9 @@ def _worker_parallel_launch( world_local_size: int, node_rank: int, init_method: str, - worker: Callable[Concatenate[ProcessGroupInfo, Optional[VllmConfig], Any, P], None], - vllm_config: Optional[VllmConfig], - env_dict: Optional[dict], + worker: Callable[Concatenate[ProcessGroupInfo, VllmConfig | None, Any, P], None], + vllm_config: VllmConfig | None, + env_dict: dict | None, *args: P.args, **kwargs: P.kwargs, ) -> None: diff --git a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py index 
48e5c4659b49a..a3e264c5f5e28 100644 --- a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py +++ b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py @@ -2,8 +2,9 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy +from collections.abc import Callable from itertools import product -from typing import Any, Callable +from typing import Any import torch diff --git a/tests/kernels/moe/parallel_utils.py b/tests/kernels/moe/parallel_utils.py index fb9e5df281f1d..d83b63e187c2f 100644 --- a/tests/kernels/moe/parallel_utils.py +++ b/tests/kernels/moe/parallel_utils.py @@ -7,12 +7,13 @@ DeepEP test utilities import dataclasses import os import traceback -from typing import Callable, Optional +from collections.abc import Callable +from typing import Concatenate import torch from torch.distributed import ProcessGroup from torch.multiprocessing import spawn # pyright: ignore[reportPrivateImportUsage] -from typing_extensions import Concatenate, ParamSpec +from typing_extensions import ParamSpec from vllm.utils import get_open_port, has_deep_ep @@ -126,8 +127,8 @@ def make_deepep_ht_a2a( pgi: ProcessGroupInfo, dp_size: int, ht_args: DeepEPHTArgs, - q_dtype: Optional[torch.dtype] = None, - block_shape: Optional[list[int]] = None, + q_dtype: torch.dtype | None = None, + block_shape: list[int] | None = None, ): import deep_ep @@ -153,8 +154,8 @@ def make_deepep_ll_a2a( pg: ProcessGroup, pgi: ProcessGroupInfo, deepep_ll_args: DeepEPLLArgs, - q_dtype: Optional[torch.dtype] = None, - block_shape: Optional[list[int]] = None, + q_dtype: torch.dtype | None = None, + block_shape: list[int] | None = None, ): import deep_ep @@ -185,10 +186,10 @@ def make_deepep_a2a( pg: ProcessGroup, pgi: ProcessGroupInfo, dp_size: int, - deepep_ht_args: Optional[DeepEPHTArgs], - deepep_ll_args: Optional[DeepEPLLArgs], - q_dtype: Optional[torch.dtype] = None, - block_shape: Optional[list[int]] = None, + deepep_ht_args: DeepEPHTArgs | None, + 
deepep_ll_args: DeepEPLLArgs | None, + q_dtype: torch.dtype | None = None, + block_shape: list[int] | None = None, ): if deepep_ht_args is not None: assert deepep_ll_args is None diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py index 09cede3fbcc77..2dce099770f08 100644 --- a/tests/kernels/moe/test_batched_moe.py +++ b/tests/kernels/moe/test_batched_moe.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import Optional import pytest import torch @@ -55,7 +54,7 @@ vllm_config.scheduler_config.max_model_len = 8192 @dataclass class BatchedMMConfig: in_dtype: torch.dtype - quant_dtype: Optional[torch.dtype] + quant_dtype: torch.dtype | None out_dtype: torch.dtype num_experts: int max_tokens_per_expert: int @@ -115,7 +114,7 @@ def test_batched_mm( K: int, N: int, dtype: torch.dtype, - block_shape: Optional[list[int]], + block_shape: list[int] | None, per_act_token_quant: bool, ): current_platform.seed_everything(7) @@ -242,7 +241,7 @@ def test_fused_moe_batched_experts( topk: int, dtype: torch.dtype, per_act_token_quant: bool, - block_shape: Optional[list[int]], + block_shape: list[int] | None, input_scales: bool, ): current_platform.seed_everything(7) diff --git a/tests/kernels/moe/test_count_expert_num_tokens.py b/tests/kernels/moe/test_count_expert_num_tokens.py index 996a4538d1054..39138be83bccb 100644 --- a/tests/kernels/moe/test_count_expert_num_tokens.py +++ b/tests/kernels/moe/test_count_expert_num_tokens.py @@ -5,7 +5,6 @@ Tests compute_expert_num_tokens kernels """ import dataclasses -from typing import Optional import pytest import torch @@ -16,7 +15,7 @@ from vllm.model_executor.layers.fused_moe.utils import count_expert_num_tokens @dataclasses.dataclass class TestTensors: topk_ids: torch.Tensor - expert_map: Optional[torch.Tensor] = None + expert_map: torch.Tensor | None = None def to_device(self, device: str): self.topk_ids = 
self.topk_ids.to(device=device) diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index b82cea61bd4ea..4330eda251f75 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -3,7 +3,6 @@ import copy import dataclasses from math import prod -from typing import Optional import pytest import torch @@ -85,16 +84,16 @@ class MOETensors: @dataclasses.dataclass class MOETensors8Bit(MOETensors): # quantized - a_q: Optional[torch.Tensor] = None # a -> a_q - w1_q: Optional[torch.Tensor] = None # w1 -> w1_q - w2_q: Optional[torch.Tensor] = None # w2 -> w2_q - a_scale: Optional[torch.Tensor] = None - w1_scale: Optional[torch.Tensor] = None - w2_scale: Optional[torch.Tensor] = None + a_q: torch.Tensor | None = None # a -> a_q + w1_q: torch.Tensor | None = None # w1 -> w1_q + w2_q: torch.Tensor | None = None # w2 -> w2_q + a_scale: torch.Tensor | None = None + w1_scale: torch.Tensor | None = None + w2_scale: torch.Tensor | None = None # dequantized - a_d: Optional[torch.Tensor] = None # a -> a_q -> a_d - w1_d: Optional[torch.Tensor] = None # w1 -> w1_q -> w1_d - w2_d: Optional[torch.Tensor] = None # w2 -> w2_q -> w2_d + a_d: torch.Tensor | None = None # a -> a_q -> a_d + w1_d: torch.Tensor | None = None # w1 -> w1_q -> w1_d + w2_d: torch.Tensor | None = None # w2 -> w2_q -> w2_d @staticmethod def make_moe_tensors_8bit( @@ -209,7 +208,7 @@ def run_8_bit( topk_ids: torch.Tensor, per_act_token: bool, per_out_ch: bool, - num_local_experts: Optional[int] = None, + num_local_experts: int | None = None, ) -> torch.Tensor: assert not any( [ @@ -280,7 +279,7 @@ def test_cutlass_moe_8_bit_no_graph( per_act_token: bool, per_out_ch: bool, monkeypatch, - ep_size: Optional[int] = None, + ep_size: int | None = None, ): current_platform.seed_everything(7) monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py 
b/tests/kernels/moe/test_deepep_deepgemm_moe.py index e68c5bfa5946f..65cd3e110a0fa 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -7,7 +7,6 @@ fp8 block-quantized case. """ import dataclasses -from typing import Optional import pytest import torch.distributed @@ -92,13 +91,13 @@ class TestConfig: block_size: list[int] # configs for testing low-latency kernels low_latency: bool - use_fp8_dispatch: Optional[bool] = False + use_fp8_dispatch: bool | None = False @dataclasses.dataclass class TestTensors: rank_tokens: torch.Tensor # all ranks make this many tokens - rank_token_scales: Optional[torch.Tensor] + rank_token_scales: torch.Tensor | None topk: torch.Tensor topk_weights: torch.Tensor config: TestConfig @@ -143,7 +142,7 @@ def make_ll_modular_kernel( max_tokens_per_rank: int, dp_size: int, hidden_size: int, - q_dtype: Optional[torch.dtype], + q_dtype: torch.dtype | None, test_config: TestConfig, quant_config: FusedMoEQuantConfig, ) -> FusedMoEModularKernel: @@ -179,7 +178,7 @@ def make_ht_modular_kernel( pgi: ProcessGroupInfo, dp_size: int, num_local_experts: int, - q_dtype: Optional[torch.dtype], + q_dtype: torch.dtype | None, test_config: TestConfig, quant_config: FusedMoEQuantConfig, ) -> FusedMoEModularKernel: @@ -249,8 +248,8 @@ def deepep_deepgemm_moe_impl( test_tensors: TestTensors, w1: torch.Tensor, w2: torch.Tensor, - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], + w1_scale: torch.Tensor | None, + w2_scale: torch.Tensor | None, ) -> torch.Tensor: test_config = test_tensors.config num_experts = test_config.num_experts diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py index a1dabea1f0c7d..527c20fe6f80b 100644 --- a/tests/kernels/moe/test_deepep_moe.py +++ b/tests/kernels/moe/test_deepep_moe.py @@ -5,7 +5,6 @@ Test deepep dispatch-combine logic """ import dataclasses -from typing import Optional, Union import pytest import 
torch.distributed @@ -90,7 +89,7 @@ class TestConfig: @dataclasses.dataclass class TestTensors: rank_tokens: torch.Tensor # all ranks make this many tokens - rank_token_scales: Optional[torch.Tensor] + rank_token_scales: torch.Tensor | None topk: torch.Tensor topk_weights: torch.Tensor config: TestConfig @@ -128,12 +127,12 @@ def make_modular_kernel( dp_size: int, num_experts: int, num_local_experts: int, - q_dtype: Optional[torch.dtype], + q_dtype: torch.dtype | None, use_fp8_dispatch: bool, quant_config: FusedMoEQuantConfig, ) -> FusedMoEModularKernel: - ht_args: Optional[DeepEPHTArgs] = None - ll_args: Optional[DeepEPLLArgs] = None + ht_args: DeepEPHTArgs | None = None + ll_args: DeepEPLLArgs | None = None if low_latency_mode: ll_args = DeepEPLLArgs( @@ -148,16 +147,14 @@ def make_modular_kernel( ) ht_args = DeepEPHTArgs(num_local_experts=num_local_experts) - a2a: Union[DeepEPHTPrepareAndFinalize, DeepEPLLPrepareAndFinalize] = ( - make_deepep_a2a( - pg=pg, - pgi=pgi, - dp_size=dp_size, - q_dtype=q_dtype, - block_shape=None, - deepep_ht_args=ht_args, - deepep_ll_args=ll_args, - ) + a2a: DeepEPHTPrepareAndFinalize | DeepEPLLPrepareAndFinalize = make_deepep_a2a( + pg=pg, + pgi=pgi, + dp_size=dp_size, + q_dtype=q_dtype, + block_shape=None, + deepep_ht_args=ht_args, + deepep_ll_args=ll_args, ) num_dispatchers = pgi.world_size // dp_size @@ -184,8 +181,8 @@ def deep_ep_moe_impl( test_tensors: TestTensors, w1: torch.Tensor, w2: torch.Tensor, - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], + w1_scale: torch.Tensor | None, + w2_scale: torch.Tensor | None, num_experts: int, use_fp8_dispatch: bool, per_act_token_quant: bool, @@ -281,8 +278,8 @@ def torch_moe_impl( test_tensors: TestTensors, w1: torch.Tensor, w2: torch.Tensor, - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], + w1_scale: torch.Tensor | None, + w2_scale: torch.Tensor | None, using_fp8_dispatch: bool, per_act_token_quant: bool, ): @@ -340,8 +337,8 @@ def 
_deep_ep_moe( config: TestConfig, w1: torch.Tensor, w2: torch.Tensor, - w1_scale: Optional[torch.Tensor], - w2_scale: Optional[torch.Tensor], + w1_scale: torch.Tensor | None, + w2_scale: torch.Tensor | None, use_fp8_dispatch: bool, per_act_token_quant: bool, ): diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index b028e676f086f..a86185a2dc461 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -5,7 +5,7 @@ import copy import textwrap import traceback from itertools import product -from typing import Any, Optional +from typing import Any import pytest import torch @@ -245,10 +245,10 @@ def test_modular_kernel_combinations_multigpu( n: int, e: int, dtype: torch.dtype, - quant_config: Optional[TestMoEQuantConfig], + quant_config: TestMoEQuantConfig | None, prepare_finalize_type: mk.FusedMoEPrepareAndFinalize, fused_experts_type: mk.FusedMoEPermuteExpertsUnpermute, - chunk_size: Optional[int], + chunk_size: int | None, world_size: int, pytestconfig, ): @@ -287,10 +287,10 @@ def test_modular_kernel_combinations_singlegpu( n: int, e: int, dtype: torch.dtype, - quant_config: Optional[TestMoEQuantConfig], + quant_config: TestMoEQuantConfig | None, prepare_finalize_type: mk.FusedMoEPrepareAndFinalize, fused_experts_type: mk.FusedMoEPermuteExpertsUnpermute, - chunk_size: Optional[int], + chunk_size: int | None, world_size: int, pytestconfig, ): diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index f357d149bd071..6b391c173f0bc 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -6,7 +6,7 @@ Run `pytest tests/kernels/test_moe.py`. 
""" import functools -from typing import Callable, Optional, Union +from collections.abc import Callable import pytest import torch @@ -80,7 +80,7 @@ vllm_config.scheduler_config.max_model_len = 8192 def run_moe_test( - baseline: Union[Callable, torch.Tensor], + baseline: Callable | torch.Tensor, moe_fn: Callable, a: torch.Tensor, w1: torch.Tensor, @@ -88,7 +88,7 @@ def run_moe_test( score: torch.Tensor, topk: int, global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, + expert_map: torch.Tensor | None = None, padding: bool = False, use_compile: bool = False, use_cudagraph: bool = False, @@ -212,7 +212,7 @@ def test_fused_moe( score: torch.Tensor, topk: int, global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, + expert_map: torch.Tensor | None = None, ) -> torch.Tensor: topk_weights, topk_ids, _ = fused_topk(a, score, topk, False) return m_fused_moe_fn( diff --git a/tests/kernels/moe/test_moe_align_block_size.py b/tests/kernels/moe/test_moe_align_block_size.py index f92526e749557..6f779c6950150 100644 --- a/tests/kernels/moe/test_moe_align_block_size.py +++ b/tests/kernels/moe/test_moe_align_block_size.py @@ -5,8 +5,6 @@ Run `pytest tests/kernels/moe/test_moe_align_block_size.py`. """ -from typing import Optional - import pytest import torch @@ -94,7 +92,7 @@ def torch_moe_align_block_size( topk_ids: torch.Tensor, block_size: int, num_experts: int, - expert_map: Optional[torch.Tensor] = None, + expert_map: torch.Tensor | None = None, pad_sorted_ids: bool = False, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py index a6214437d404a..da9fe33a1c620 100644 --- a/tests/kernels/moe/test_moe_permute_unpermute.py +++ b/tests/kernels/moe/test_moe_permute_unpermute.py @@ -5,8 +5,6 @@ Run `pytest tests/kernels/test_moe_permute_unpermute.py`. 
""" -from typing import Optional - import numpy as np import pytest import torch @@ -34,8 +32,8 @@ def torch_permute( n_expert: int, n_local_expert: int, start_expert: int, - expert_map: Optional[torch.Tensor] = None, - align_block_size: Optional[int] = None, + expert_map: torch.Tensor | None = None, + align_block_size: int | None = None, fill_invalid_expert: int = -1, ) -> list[torch.Tensor]: n_token, n_hidden = hidden_states.shape[0], hidden_states.shape[1] @@ -210,7 +208,7 @@ def test_moe_permute_unpermute( n_expert: int, ep_size: int, dtype: torch.dtype, - align_block_size: Optional[int], + align_block_size: int | None, ): if not moe_permute_unpermute_supported(): pytest.skip("moe_permute_unpermute is not supported on this platform.") diff --git a/tests/kernels/moe/test_ocp_mx_moe.py b/tests/kernels/moe/test_ocp_mx_moe.py index dceed34f35125..7a5d10a87b741 100644 --- a/tests/kernels/moe/test_ocp_mx_moe.py +++ b/tests/kernels/moe/test_ocp_mx_moe.py @@ -4,7 +4,6 @@ import importlib.metadata from dataclasses import dataclass from importlib.util import find_spec -from typing import Optional import pytest import torch @@ -103,7 +102,7 @@ def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase): assert output -def swiglu(x, alpha: float = 1.702, beta: float = 1.0, limit: Optional[float] = None): +def swiglu(x, alpha: float = 1.702, beta: float = 1.0, limit: float | None = None): # Note we add an extra bias of 1 to the linear layer x_glu, x_linear = torch.chunk(x, 2, dim=-1) if limit is not None: @@ -510,7 +509,7 @@ def test_trtllm_gen_mxfp4_fused_moe( hidden_size: int, alpha: float, beta: float, - limit: Optional[float], + limit: float | None, act_type: str, transpose_optimized: bool, ): @@ -660,7 +659,7 @@ def test_flashinfer_cutlass_mxfp4_fused_moe( hidden_size: int, alpha: float, beta: float, - limit: Optional[float], + limit: float | None, ): torch.manual_seed(42) device = "cuda:0" @@ -811,9 +810,9 @@ def 
test_flashinfer_cutlass_mxfp4_mxfp8_fused_moe( num_tokens: int, intermediate_size: int, hidden_size: int, - alpha: Optional[float], - beta: Optional[float], - limit: Optional[float], + alpha: float | None, + beta: float | None, + limit: float | None, ): torch.manual_seed(42) device = "cuda:0" diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index 4c7c6c6a4f529..ac7f3fc5e6f05 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest import torch @@ -73,7 +72,7 @@ def pplx_cutlass_moe( out_dtype, per_act_token: bool, per_out_ch: bool, - group_name: Optional[str], + group_name: str | None, ): from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import ( PplxPrepareAndFinalize, diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py index 223f095c0b553..e665c636fa265 100644 --- a/tests/kernels/moe/test_pplx_moe.py +++ b/tests/kernels/moe/test_pplx_moe.py @@ -9,7 +9,7 @@ import copy import itertools import textwrap import traceback -from typing import Callable, Optional, Union +from collections.abc import Callable import pytest import torch @@ -89,7 +89,7 @@ def torch_prepare( a: torch.Tensor, topk_ids: torch.Tensor, num_experts: int, - max_num_tokens: Optional[int] = None, + max_num_tokens: int | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: assert topk_ids.dim() == 2 assert topk_ids.shape[0] == a.shape[0] @@ -214,10 +214,10 @@ def create_pplx_prepare_finalize( dp_size: int, world_size: int, in_dtype: torch.dtype, - quant_dtype: Optional[torch.dtype], - block_shape: Optional[list[int]], + quant_dtype: torch.dtype | None, + block_shape: list[int] | None, per_act_token_quant: bool, - group_name: Optional[str], + group_name: str | None, ): from 
vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import ( PplxPrepareAndFinalize, @@ -274,18 +274,14 @@ def chunk_by_rank(t: torch.Tensor, r: int, w: int) -> torch.Tensor: return t[(r * chunk) : (r + 1) * chunk] -def maybe_chunk_by_rank( - t: Optional[torch.Tensor], r: int, w: int -) -> Optional[torch.Tensor]: +def maybe_chunk_by_rank(t: torch.Tensor | None, r: int, w: int) -> torch.Tensor | None: if t is not None: return chunk_by_rank(t, r, w) else: return t -def chunk_scales_by_rank( - t: Optional[torch.Tensor], r: int, w: int -) -> Optional[torch.Tensor]: +def chunk_scales_by_rank(t: torch.Tensor | None, r: int, w: int) -> torch.Tensor | None: if t is not None and t.numel() > 1: chunk = rank_chunk(t.shape[0], r, w) return t[(r * chunk) : (r + 1) * chunk] @@ -293,9 +289,7 @@ def chunk_scales_by_rank( return t -def chunk_scales( - t: Optional[torch.Tensor], start: int, end: int -) -> Optional[torch.Tensor]: +def chunk_scales(t: torch.Tensor | None, start: int, end: int) -> torch.Tensor | None: if t is not None and t.numel() > 1: return t[start:end] else: @@ -313,10 +307,10 @@ def pplx_prepare_finalize( topk_weight: torch.Tensor, topk_ids: torch.Tensor, num_experts: int, - quant_dtype: Optional[torch.dtype], - block_shape: Optional[list[int]], + quant_dtype: torch.dtype | None, + block_shape: list[int] | None, per_act_token_quant: bool, - group_name: Optional[str], + group_name: str | None, ) -> torch.Tensor: assert torch.cuda.current_device() == pgi.local_rank @@ -409,8 +403,8 @@ def _pplx_prepare_finalize( score: torch.Tensor, topk: torch.Tensor, num_experts: int, - quant_dtype: Optional[torch.dtype], - block_shape: Optional[list[int]], + quant_dtype: torch.dtype | None, + block_shape: list[int] | None, per_act_token_quant: bool, use_internode: bool, ): @@ -479,7 +473,7 @@ def test_pplx_prepare_finalize_slow( dtype: torch.dtype, world_dp_size: tuple[int, int], per_act_token_quant: bool, - block_shape: Optional[list[int]], + block_shape: list[int] | 
None, use_internode: bool, ): if dtype == torch.float8_e4m3fn: @@ -521,7 +515,7 @@ def test_pplx_prepare_finalize_slow( def pplx_moe( - group_name: Optional[str], + group_name: str | None, rank: int, world_size: int, dp_size: int, @@ -530,17 +524,17 @@ def pplx_moe( w2: torch.Tensor, topk_weight: torch.Tensor, topk_ids: torch.Tensor, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - quant_dtype: Optional[torch.dtype] = None, + w1_scale: torch.Tensor | None = None, + w2_scale: torch.Tensor | None = None, + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, + quant_dtype: torch.dtype | None = None, per_act_token_quant=False, - block_shape: Optional[list[int]] = None, + block_shape: list[int] | None = None, use_compile: bool = False, use_cudagraphs: bool = True, - shared_experts: Optional[torch.nn.Module] = None, -) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: + shared_experts: torch.nn.Module | None = None, +) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: num_tokens, hidden_dim = a.shape num_experts = w1.shape[0] topk = topk_ids.shape[1] @@ -657,13 +651,13 @@ def _pplx_moe( score: torch.Tensor, topk: int, num_experts: int, - w1_s: Optional[torch.Tensor] = None, - w2_s: Optional[torch.Tensor] = None, - quant_dtype: Optional[torch.dtype] = None, + w1_s: torch.Tensor | None = None, + w2_s: torch.Tensor | None = None, + quant_dtype: torch.dtype | None = None, per_act_token_quant: bool = False, - block_shape: Optional[list[int]] = None, + block_shape: list[int] | None = None, use_internode: bool = False, - shared_experts: Optional[torch.nn.Module] = None, + shared_experts: torch.nn.Module | None = None, ): try: if use_internode: @@ -812,7 +806,7 @@ def test_pplx_moe_slow( dtype: torch.dtype, world_dp_size: tuple[int, int], per_act_token_quant: bool, - block_shape: Optional[list[int]], + block_shape: 
list[int] | None, use_internode: bool, ): current_platform.seed_everything(7) diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py index 9466dacb0c111..65ce4073ad5bc 100644 --- a/tests/kernels/moe/utils.py +++ b/tests/kernels/moe/utils.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional, Union import torch @@ -27,13 +26,13 @@ def triton_moe( w2: torch.Tensor, topk_weight: torch.Tensor, topk_ids: torch.Tensor, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - quant_dtype: Optional[torch.dtype] = None, + w1_scale: torch.Tensor | None = None, + w2_scale: torch.Tensor | None = None, + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, + quant_dtype: torch.dtype | None = None, per_act_token_quant=False, - block_shape: Optional[list[int]] = None, + block_shape: list[int] | None = None, ) -> torch.Tensor: quant_config = FusedMoEQuantConfig.make( quant_dtype, @@ -54,13 +53,13 @@ def batched_moe( w2: torch.Tensor, topk_weight: torch.Tensor, topk_ids: torch.Tensor, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - quant_dtype: Optional[torch.dtype] = None, + w1_scale: torch.Tensor | None = None, + w2_scale: torch.Tensor | None = None, + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, + quant_dtype: torch.dtype | None = None, per_act_token_quant: bool = False, - block_shape: Optional[list[int]] = None, + block_shape: list[int] | None = None, ) -> torch.Tensor: max_num_tokens = round_up(a.shape[0], 64) @@ -94,13 +93,13 @@ def naive_batched_moe( w2: torch.Tensor, topk_weight: torch.Tensor, topk_ids: torch.Tensor, - w1_scale: Optional[torch.Tensor] = None, - 
w2_scale: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - quant_dtype: Optional[torch.dtype] = None, + w1_scale: torch.Tensor | None = None, + w2_scale: torch.Tensor | None = None, + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, + quant_dtype: torch.dtype | None = None, per_act_token_quant: bool = False, - block_shape: Optional[list[int]] = None, + block_shape: list[int] | None = None, ) -> torch.Tensor: max_num_tokens = round_up(a.shape[0], 64) @@ -129,8 +128,8 @@ def naive_batched_moe( def chunk_scales( - scales: Optional[torch.Tensor], start: int, end: int -) -> Optional[torch.Tensor]: + scales: torch.Tensor | None, start: int, end: int +) -> torch.Tensor | None: if scales is not None: if scales.numel() == 1: return scales @@ -144,10 +143,10 @@ def make_quantized_test_activations( m: int, k: int, in_dtype: torch.dtype, - quant_dtype: Optional[torch.dtype] = None, - block_shape: Optional[list[int]] = None, + quant_dtype: torch.dtype | None = None, + block_shape: list[int] | None = None, per_act_token_quant: bool = False, -) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]: a = torch.randn((E, m, k), device="cuda", dtype=in_dtype) / 10 a_q = a a_scale = None @@ -172,11 +171,11 @@ def make_quantized_test_activations( def moe_quantize_weights( w: torch.Tensor, - w_s: Optional[torch.Tensor], - quant_dtype: Union[torch.dtype, str, None], + w_s: torch.Tensor | None, + quant_dtype: torch.dtype | str | None, per_token_quant: bool, - block_shape: Optional[list[int]], -) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + block_shape: list[int] | None, +) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]: assert ( quant_dtype == torch.float8_e4m3fn or quant_dtype == torch.int8 @@ -220,10 +219,10 @@ def make_test_weight( rows: int, cols: int, in_dtype: torch.dtype 
= torch.bfloat16, - quant_dtype: Union[torch.dtype, str, None] = None, - block_shape: Optional[list[int]] = None, + quant_dtype: torch.dtype | str | None = None, + block_shape: list[int] | None = None, per_out_ch_quant: bool = False, -) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None, torch.Tensor | None]: w_16 = torch.randn((e, rows, cols), device="cuda", dtype=in_dtype) / 15 w_gs = None @@ -262,12 +261,12 @@ def make_test_weights( n: int, k: int, in_dtype: torch.dtype = torch.bfloat16, - quant_dtype: Union[torch.dtype, str, None] = None, - block_shape: Optional[list[int]] = None, + quant_dtype: torch.dtype | str | None = None, + block_shape: list[int] | None = None, per_out_ch_quant: bool = False, ) -> tuple[ - tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]], - tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]], + tuple[torch.Tensor, torch.Tensor, torch.Tensor | None, torch.Tensor | None], + tuple[torch.Tensor, torch.Tensor, torch.Tensor | None, torch.Tensor | None], ]: return ( make_test_weight( @@ -295,9 +294,9 @@ def make_test_quant_config( n: int, k: int, in_dtype: torch.dtype, - quant_dtype: Union[torch.dtype, str, None] = None, + quant_dtype: torch.dtype | str | None = None, per_act_token_quant: bool = False, - block_shape: Optional[list[int]] = None, + block_shape: list[int] | None = None, ) -> tuple[torch.Tensor, torch.Tensor, FusedMoEQuantConfig]: (_, w1, w1_s, w1_gs), (_, w2, w2_s, w2_gs) = make_test_weights( e, @@ -310,8 +309,8 @@ def make_test_quant_config( ) # Hacky/trivial scales for nvfp4. 
- a1_gscale: Optional[torch.Tensor] = None - a2_gscale: Optional[torch.Tensor] = None + a1_gscale: torch.Tensor | None = None + a2_gscale: torch.Tensor | None = None if quant_dtype == "nvfp4": a1_gscale = torch.ones((e,), device="cuda", dtype=torch.float32) a2_gscale = torch.ones((e,), device="cuda", dtype=torch.float32) @@ -348,9 +347,9 @@ def fused_moe( score: torch.Tensor, topk: int, renormalize: bool = False, - quant_config: Optional[FusedMoEQuantConfig] = None, + quant_config: FusedMoEQuantConfig | None = None, global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, + expert_map: torch.Tensor | None = None, ) -> torch.Tensor: topk_weights, topk_ids, _ = fused_topk( hidden_states, score.float(), topk, renormalize @@ -378,7 +377,7 @@ class BaselineMM(torch.nn.Module): self.b = b.to(dtype=torch.float32) self.out_dtype = out_dtype - def forward(self, a: torch.Tensor) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + def forward(self, a: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor | None]: return torch.mm(a.to(dtype=torch.float32), self.b).to(self.out_dtype), None @@ -422,8 +421,8 @@ class RealMLP(torch.nn.Module): quant_config=None, reduce_results: bool = True, prefix: str = "", - w1_s: Optional[torch.Tensor] = None, - w2_s: Optional[torch.Tensor] = None, + w1_s: torch.Tensor | None = None, + w2_s: torch.Tensor | None = None, ) -> None: from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -481,7 +480,7 @@ def make_shared_experts( N: int, K: int, in_dtype: torch.dtype = torch.bfloat16, - quant_dtype: Union[torch.dtype, str, None] = None, + quant_dtype: torch.dtype | str | None = None, ) -> torch.nn.Module: from vllm.model_executor.layers.quantization.fp8 import Fp8Config diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index d892f2a5acc09..9d11a7ef64138 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional, Union import torch @@ -15,13 +14,13 @@ ROCM_FP8FNUZ_MAX = 224.0 FP8_DTYPE = current_platform.fp8_dtype() -def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor: +def as_float32_tensor(x: float | torch.Tensor) -> torch.Tensor: return torch.as_tensor(x, dtype=torch.float32, device="cuda") def ref_dynamic_per_token_quant( - x: torch.tensor, quant_dtype: torch.dtype, scale_ub: Optional[torch.tensor] = None -) -> tuple[torch.tensor, torch.tensor]: + x: torch.Tensor, quant_dtype: torch.dtype, scale_ub: torch.Tensor | None = None +) -> tuple[torch.Tensor, torch.Tensor]: assert quant_dtype in [torch.int8, FP8_DTYPE] if scale_ub is not None: assert quant_dtype == FP8_DTYPE @@ -76,8 +75,8 @@ def ref_dynamic_per_token_quant( # ref_dynamic_per_token_quant, when we have a dynamic_per_tensor int8 quant # kernel def ref_dynamic_per_tensor_fp8_quant( - x: torch.tensor, -) -> tuple[torch.tensor, torch.tensor]: + x: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: fp8_traits = torch.finfo(FP8_DTYPE) fp8_traits_max = ( ROCM_FP8FNUZ_MAX @@ -250,10 +249,10 @@ def per_block_cast_to_int8( def dequant( t: torch.Tensor, - scale: Optional[torch.Tensor], - block_shape: Optional[list[int]], + scale: torch.Tensor | None, + block_shape: list[int] | None, per_act_token_quant: bool, - out_dtype: Optional[torch.dtype] = torch.float32, + out_dtype: torch.dtype | None = torch.float32, ) -> torch.Tensor: if scale is not None: f32 = torch.float32 @@ -267,10 +266,10 @@ def dequant( def batched_dequant( t: torch.Tensor, - scale: Optional[torch.Tensor], - block_shape: Optional[list[int]], + scale: torch.Tensor | None, + block_shape: list[int] | None, per_act_token_quant: bool, - out_dtype: Optional[torch.dtype] = torch.float32, + out_dtype: torch.dtype | None = torch.float32, ) -> torch.Tensor: if scale is not None: assert t.shape[0] == scale.shape[0] @@ -289,9 +288,9 @@ def 
native_batched_masked_quant_matmul( B: torch.Tensor, C: torch.Tensor, num_expert_tokens: torch.Tensor, - A_scale: Optional[torch.Tensor] = None, - B_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None, + A_scale: torch.Tensor | None = None, + B_scale: torch.Tensor | None = None, + block_shape: list[int] | None = None, per_act_token_quant: bool = False, ) -> torch.Tensor: num_expert_tokens_cpu = num_expert_tokens.clone() diff --git a/tests/kernels/quantization/test_cutlass_w4a8.py b/tests/kernels/quantization/test_cutlass_w4a8.py index a3d524fe90ed0..465e24fd7eb97 100644 --- a/tests/kernels/quantization/test_cutlass_w4a8.py +++ b/tests/kernels/quantization/test_cutlass_w4a8.py @@ -6,7 +6,6 @@ Run `pytest tests/kernels/quantization/test_cutlass_w4a8.py`. """ from dataclasses import dataclass -from typing import Optional import pytest import torch @@ -60,10 +59,10 @@ SCHEDULES = [ class TypeConfig: act_type: torch.dtype weight_type: ScalarType - output_type: Optional[torch.dtype] - group_scale_type: Optional[torch.dtype] - channel_scale_type: Optional[torch.dtype] - token_scale_type: Optional[torch.dtype] + output_type: torch.dtype | None + group_scale_type: torch.dtype | None + channel_scale_type: torch.dtype | None + token_scale_type: torch.dtype | None @dataclass @@ -80,7 +79,7 @@ class Tensors: # (Act Type, Weight Type, Output Type, Scale Type, ZeroPoints, # Ch Scales Type, Tok Scales Type) TestTypeTuple = tuple[ - list[torch.dtype], ScalarType, Optional[torch.dtype], Optional[torch.dtype], bool + list[torch.dtype], ScalarType, torch.dtype | None, torch.dtype | None, bool ] TEST_TYPES = [ *( @@ -116,8 +115,8 @@ def cutlass_quantize_and_pack( atype: torch.dtype, w: torch.Tensor, wtype: ScalarType, - stype: Optional[torch.dtype], - group_size: Optional[int], + stype: torch.dtype | None, + group_size: int | None, zero_points: bool = False, ): assert wtype.is_integer(), "TODO: support floating point weights" @@ -143,7 +142,7 @@ def 
cutlass_quantize_and_pack( def create_test_tensors( - shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int] + shape: tuple[int, int, int], types: TypeConfig, group_size: int | None ) -> Tensors: m, n, k = shape @@ -185,8 +184,8 @@ def create_test_tensors( def mm_test_helper( types: TypeConfig, tensors: Tensors, - group_size: Optional[int] = None, - schedule: Optional[str] = None, + group_size: int | None = None, + schedule: str | None = None, ): # CUTLASS upstream uses fp8 with fastaccum as reference # https://github.com/NVIDIA/cutlass/blob/main/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu#L406 diff --git a/tests/kernels/quantization/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py index b32523bb85d9a..efa81de158d38 100644 --- a/tests/kernels/quantization/test_machete_mm.py +++ b/tests/kernels/quantization/test_machete_mm.py @@ -7,7 +7,6 @@ Run `pytest tests/kernels/quantization/test_machete_mm.py`. import math from dataclasses import dataclass, fields -from typing import Optional import pytest import torch @@ -50,11 +49,11 @@ MNK_SHAPES = [ class TypeConfig: act_type: torch.dtype weight_type: ScalarType - output_type: Optional[torch.dtype] - group_scale_type: Optional[torch.dtype] - group_zero_type: Optional[torch.dtype] - channel_scale_type: Optional[torch.dtype] - token_scale_type: Optional[torch.dtype] + output_type: torch.dtype | None + group_scale_type: torch.dtype | None + group_zero_type: torch.dtype | None + channel_scale_type: torch.dtype | None + token_scale_type: torch.dtype | None @dataclass @@ -63,10 +62,10 @@ class Tensors: a_ref: torch.Tensor a: torch.Tensor w_q: torch.Tensor - w_g_s: Optional[torch.Tensor] - w_g_zp: Optional[torch.Tensor] - w_ch_s: Optional[torch.Tensor] - w_tok_s: Optional[torch.Tensor] + w_g_s: torch.Tensor | None + w_g_zp: torch.Tensor | None + w_ch_s: torch.Tensor | None + w_tok_s: torch.Tensor | None # (Act Type, Weight Type, Output Type, Scale Type, ZeroPoints, @@ -74,7 
+73,7 @@ class Tensors: # NOTE: None "Scale Type" means the act type is floating point # None "Output Type" means the output type is the same as the act type TestTypeTuple = tuple[ - list[torch.dtype], ScalarType, Optional[torch.dtype], Optional[torch.dtype], bool + list[torch.dtype], ScalarType, torch.dtype | None, torch.dtype | None, bool ] TEST_TYPES = [ # GPTQ style @@ -139,11 +138,11 @@ def rand_data(shape, dtype=torch.float16, scale=1, offset=0): return torch.randint(-8, 7, shape, dtype=dtype, device="cuda") -def maybe_convert_zeropoints(zps: Optional[torch.Tensor], s: torch.Tensor): +def maybe_convert_zeropoints(zps: torch.Tensor | None, s: torch.Tensor): return zps if zps is None else -1 * s * (zps.to(s.dtype)) -def group_size_valid(shape: tuple[int, int, int], group_size: Optional[int]) -> bool: +def group_size_valid(shape: tuple[int, int, int], group_size: int | None) -> bool: return group_size is None or group_size == -1 or shape[2] % group_size == 0 @@ -151,8 +150,8 @@ def machete_quantize_and_pack( atype: torch.dtype, w: torch.Tensor, wtype: ScalarType, - stype: Optional[torch.dtype], - group_size: Optional[int], + stype: torch.dtype | None, + group_size: int | None, zero_points: bool = False, ): assert wtype.is_integer(), "TODO: support floating point weights" @@ -178,8 +177,8 @@ def machete_quantize_and_pack( def create_test_tensors( shape: tuple[int, int, int], types: TypeConfig, - group_size: Optional[int], - subset_stride_factor: Optional[int] = None, + group_size: int | None, + subset_stride_factor: int | None = None, ) -> Tensors: m, n, k = shape factor = subset_stride_factor or 1 @@ -243,8 +242,8 @@ def create_test_tensors( def machete_mm_test_helper( types: TypeConfig, tensors: Tensors, - group_size: Optional[int] = None, - schedule: Optional[str] = None, + group_size: int | None = None, + schedule: str | None = None, ): output_ref = torch.matmul(tensors.a_ref, tensors.w_ref) output_ref_type = output_ref.dtype @@ -294,7 +293,7 @@ def 
machete_mm_test_helper( @pytest.mark.parametrize("shape", MNK_SHAPES, ids=lambda x: "x".join(str(v) for v in x)) @pytest.mark.parametrize("types", TEST_TYPES) def test_machete_all_schedules(shape, types: TypeConfig): - group_sizes: list[Optional[int]] = [] + group_sizes: list[int | None] = [] if types.group_scale_type is None: group_sizes = [None] else: @@ -323,7 +322,7 @@ def test_machete_all_schedules(shape, types: TypeConfig): @pytest.mark.parametrize("shape", MNK_SHAPES, ids=lambda x: "x".join(str(v) for v in x)) @pytest.mark.parametrize("types", TEST_TYPES) def test_machete_heuristic(shape, types: TypeConfig): - group_sizes: list[Optional[int]] = [] + group_sizes: list[int | None] = [] if types.group_scale_type is None: group_sizes = [None] else: diff --git a/tests/kernels/quantization/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py index 1026332d99f89..6633a8bbd3c60 100644 --- a/tests/kernels/quantization/test_triton_scaled_mm.py +++ b/tests/kernels/quantization/test_triton_scaled_mm.py @@ -6,7 +6,6 @@ Run `pytest tests/kernels/quantization/test_triton_scaled_mm.py`. 
""" import importlib -from typing import Optional import pytest import torch @@ -27,7 +26,7 @@ def torch_scaled_mm( scale_a: torch.Tensor, scale_b: torch.Tensor, out_dtype: type[torch.dtype], - bias: Optional[torch.Tensor] = None, + bias: torch.Tensor | None = None, ) -> torch.Tensor: out = torch.mm(a.to(torch.float32), b.to(torch.float32)) out = scale_a * out diff --git a/tests/kernels/test_onednn.py b/tests/kernels/test_onednn.py index 9f78c177a81f0..c9eca1f86d3a1 100644 --- a/tests/kernels/test_onednn.py +++ b/tests/kernels/test_onednn.py @@ -2,8 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Integration tests for FlexAttention backend vs default backend""" -from typing import Optional - import pytest import torch @@ -38,8 +36,8 @@ def ref_int8_scaled_mm( b: torch.Tensor, scale_a: torch.Tensor, scale_b: torch.Tensor, - azp: Optional[torch.Tensor], - bias: Optional[torch.Tensor], + azp: torch.Tensor | None, + bias: torch.Tensor | None, output_type: torch.dtype, ): if azp is not None: diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index 015424d9ee0f7..6c7ff984b4337 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -7,7 +7,7 @@ import random import unittest from collections.abc import Sequence from numbers import Number -from typing import Any, NamedTuple, Optional, Union +from typing import Any, NamedTuple import pytest import torch @@ -96,10 +96,10 @@ class PackedQKVInputs(NamedTuple): query: torch.Tensor key: torch.Tensor value: torch.Tensor - q_start_loc_list: Optional[list[int]] - kv_start_loc_list: Optional[list[int]] - q_seq_lens: Optional[list[int]] - kv_seq_lens: Optional[list[int]] + q_start_loc_list: list[int] | None + kv_start_loc_list: list[int] | None + q_seq_lens: list[int] | None + kv_seq_lens: list[int] | None class PackedQKVO(NamedTuple): @@ -115,7 +115,7 @@ class PackedQKVO(NamedTuple): x head_size) known-correct attention output """ - packed_qkv: Optional[PackedQKVInputs] + 
packed_qkv: PackedQKVInputs | None ideal_output: torch.Tensor @@ -149,12 +149,12 @@ class PhaseTestParameters(NamedTuple): """ packed_qkvo: PackedQKVO - kv_mmap: Optional[KVMemoryMap] + kv_mmap: KVMemoryMap | None def maybe_make_int_tensor( - _list: Optional[list[int]], - device: Union[torch.device, str], + _list: list[int] | None, + device: torch.device | str, ) -> torch.Tensor: """ Convert Python int list to a 1D int torch.Tensor on `device` @@ -170,8 +170,8 @@ def maybe_make_int_tensor( def maybe_make_long_tensor( - _list: Optional[list[int]], - device: Union[torch.device, str], + _list: list[int] | None, + device: torch.device | str, ) -> torch.Tensor: """ Convert Python int list to a 1D long torch.Tensor on `device` @@ -186,7 +186,7 @@ def maybe_make_long_tensor( ) -def maybe_max(_list: Optional[list]) -> Optional[Number]: +def maybe_max(_list: list | None) -> Number | None: """ Returns: @@ -241,9 +241,9 @@ def ref_masked_attention( key: torch.Tensor, value: torch.Tensor, scale: float, - custom_mask: Optional[torch.Tensor] = None, - q_seq_lens: Optional[list] = None, - kv_seq_lens: Optional[list] = None, + custom_mask: torch.Tensor | None = None, + q_seq_lens: list | None = None, + kv_seq_lens: list | None = None, ) -> torch.Tensor: """ "Golden" masked attention reference. 
Supports two types of masking: @@ -302,11 +302,11 @@ def ref_masked_attention( def make_qkv( batch_size: int, max_q_seq_len: int, - max_kv_seq_len: Optional[int], + max_kv_seq_len: int | None, num_heads: int, head_size: int, - device: Union[torch.device, str], - force_kv_seq_lens: Optional[list[int]] = None, + device: torch.device | str, + force_kv_seq_lens: list[int] | None = None, attn_type: AttentionType = AttentionType.ENCODER_DECODER, force_max_len: bool = False, ) -> tuple[QKVInputs, QKVInputs, QKVInputs]: @@ -436,7 +436,7 @@ def make_qkv( def pack_tensor( - unpacked_tensor: torch.Tensor, seq_lens: list[int], device: Union[torch.device, str] + unpacked_tensor: torch.Tensor, seq_lens: list[int], device: torch.device | str ) -> tuple[torch.Tensor, list[int]]: """ Pack a batch_size x padded_seq_len x num_heads x head_size tensor into an @@ -470,7 +470,7 @@ def pack_tensor( return packed_tensor, start_loc_list -def pack_qkv(qkv: QKVInputs, device: Union[torch.device, str]) -> PackedQKVInputs: +def pack_qkv(qkv: QKVInputs, device: torch.device | str) -> PackedQKVInputs: """ Individually pack each of Q, K and V, each with dimensions batch_size x padded_seq_len x num_heads x head_size, into respective number_of_tokens x @@ -594,19 +594,19 @@ def make_alibi_bias( def _make_metadata_tensors( - seq_lens: Optional[list[int]], - context_lens: Optional[list[int]], - encoder_seq_lens: Optional[list[int]], - device: Union[torch.device, str], + seq_lens: list[int] | None, + context_lens: list[int] | None, + encoder_seq_lens: list[int] | None, + device: torch.device | str, ) -> tuple[ torch.Tensor, torch.Tensor, Any, Any, - Optional[torch.Tensor], + torch.Tensor | None, torch.Tensor, torch.Tensor, - Optional[int], + int | None, ]: """ Build scalar & tensor values required to build attention metadata structure. 
@@ -678,7 +678,7 @@ def make_kv_cache( num_heads: int, head_size: int, block_size: int, - device: Union[torch.device, str], + device: torch.device | str, backend: str, default_val: float = 0.0, ) -> torch.Tensor: @@ -726,18 +726,18 @@ def _num_tokens_to_min_blocks(num_tokens: int, block_size: int) -> int: return (num_tokens + block_size) // block_size -def make_empty_slot_mapping_tensor(device: Union[torch.device, str]): +def make_empty_slot_mapping_tensor(device: torch.device | str): return maybe_make_long_tensor([], device) -def make_empty_block_tables_tensor(device: Union[torch.device, str]): +def make_empty_block_tables_tensor(device: torch.device | str): return torch.tensor([], device=device) def split_slot_mapping( slot_mapping_list: torch.Tensor, seq_lens: list[int], - device: Union[torch.device, str], + device: torch.device | str, ): """ Split a slot mapping into valid prefill- and decode-phase slot mappings. @@ -799,7 +799,7 @@ def split_slot_mapping( def make_block_tables_slot_mapping( block_size: int, seq_lens: list[int], - device: Union[torch.device, str], + device: torch.device | str, block_base_addr: int = 0, ) -> tuple[torch.Tensor, list[int], int]: """ @@ -880,11 +880,11 @@ def make_block_tables_slot_mapping( def make_test_metadata( attn_backend: _Backend, is_prompt: bool, - seq_lens: Optional[list[int]], - decoder_test_params: Optional[PhaseTestParameters], - device: Union[torch.device, str], - encoder_test_params: Optional[PhaseTestParameters] = None, - cross_test_params: Optional[PhaseTestParameters] = None, + seq_lens: list[int] | None, + decoder_test_params: PhaseTestParameters | None, + device: torch.device | str, + encoder_test_params: PhaseTestParameters | None = None, + cross_test_params: PhaseTestParameters | None = None, ) -> AttentionMetadata: """ Construct fake attention metadata for a given test phase @@ -1142,16 +1142,16 @@ def torch_experts( topk_weight: torch.Tensor, topk_ids: torch.Tensor, global_num_experts: int = -1, - b_bias1: 
Optional[torch.Tensor] = None, - b_bias2: Optional[torch.Tensor] = None, - expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - quant_dtype: Optional[torch.dtype] = None, + b_bias1: torch.Tensor | None = None, + b_bias2: torch.Tensor | None = None, + expert_map: torch.Tensor | None = None, + w1_scale: torch.Tensor | None = None, + w2_scale: torch.Tensor | None = None, + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, + quant_dtype: torch.dtype | None = None, per_act_token_quant=False, - block_shape: Optional[list[int]] = None, + block_shape: list[int] | None = None, apply_router_weights_on_input: bool = False, ) -> torch.Tensor: assert ( @@ -1261,10 +1261,10 @@ def torch_moe( w2: torch.Tensor, score: torch.Tensor, topk: int, - b_bias1: Optional[torch.Tensor] = None, - b_bias2: Optional[torch.Tensor] = None, + b_bias1: torch.Tensor | None = None, + b_bias2: torch.Tensor | None = None, global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, + expert_map: torch.Tensor | None = None, ) -> torch.Tensor: score = torch.softmax(score, dim=-1, dtype=torch.float32) topk_weight, topk_ids = torch.topk(score, topk) @@ -1298,15 +1298,13 @@ def torch_moe_single(a, w, score, topk): # A special version of op check that has a restricted default set of test_utils # and a patched version of allclose that supports fp8 types. 
def opcheck( - op: Union[ - torch._ops.OpOverload, - torch._ops.OpOverloadPacket, - torch._library.custom_ops.CustomOpDef, - ], + op: torch._ops.OpOverload + | torch._ops.OpOverloadPacket + | torch._library.custom_ops.CustomOpDef, args: tuple[Any, ...], - kwargs: Optional[dict[str, Any]] = None, + kwargs: dict[str, Any] | None = None, *, - test_utils: Union[str, Sequence[str]] = ALL_OPCHECK_TEST_UTILS, + test_utils: str | Sequence[str] = ALL_OPCHECK_TEST_UTILS, raise_exception: bool = True, cond: bool = True, ) -> dict[str, str]: @@ -1338,7 +1336,7 @@ def baseline_scaled_mm( scale_a: torch.Tensor, scale_b: torch.Tensor, out_dtype: type[torch.dtype], - bias: Optional[torch.Tensor] = None, + bias: torch.Tensor | None = None, ) -> torch.Tensor: # We treat N-dimensional group scaling as extended numpy-style broadcasting # in numpy simply stretches dimensions with an extent of 1 to match diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 695e06e7c1d63..8f18f01441932 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -4,7 +4,6 @@ import random from copy import deepcopy from dataclasses import dataclass -from typing import Optional from unittest.mock import patch import pytest @@ -106,7 +105,7 @@ def skip_cuda_with_stage_false(request): def get_random_id_to_index( num_loras: int, num_slots: int, log: bool = True -) -> list[Optional[int]]: +) -> list[int | None]: """Creates a random lora_id_to_index mapping. Args: @@ -122,7 +121,7 @@ def get_random_id_to_index( "num_loras must be less than or equal to num_slots." 
) - slots: list[Optional[int]] = [None] * num_slots + slots: list[int | None] = [None] * num_slots random_slot_selections = (torch.randperm(num_slots)[:num_loras]).tolist() for lora_id, slot_idx in enumerate(random_slot_selections, start=1): slots[slot_idx] = lora_id @@ -134,7 +133,7 @@ def get_random_id_to_index( def populate_loras( - id_to_index: list[Optional[int]], + id_to_index: list[int | None], layer: BaseLayerWithLoRA, layer_weights: torch.Tensor, generate_embeddings_tensor: int = 0, diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 0d9431bd7aaea..50fd63d35cded 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import subprocess import sys -from typing import Union import vllm from vllm import LLM @@ -27,7 +26,7 @@ def do_sample( llm: vllm.LLM, lora_path: str, lora_id: int, - tensorizer_config_dict: Union[dict, None] = None, + tensorizer_config_dict: dict | None = None, ) -> list[str]: prompts = [ "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 @@ -73,9 +72,7 @@ def do_sample( return generated_texts -def generate_and_test( - llm, sql_lora_files, tensorizer_config_dict: Union[dict, None] = None -): +def generate_and_test(llm, sql_lora_files, tensorizer_config_dict: dict | None = None): print("lora adapter created") print("lora 1") assert ( diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index 894263bd0ba38..1800ca107a426 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import Optional import pytest @@ -20,7 +19,7 @@ 
class TestConfig: max_loras: int = 2 max_lora_rank: int = 16 max_model_len: int = 4096 - mm_processor_kwargs: Optional[dict[str, int]] = None + mm_processor_kwargs: dict[str, int] | None = None def __post_init__(self): if self.mm_processor_kwargs is None: @@ -61,7 +60,7 @@ class Qwen2VLTester: self, images: list[ImageAsset], expected_outputs: list[str], - lora_id: Optional[int] = None, + lora_id: int | None = None, temperature: float = 0, max_tokens: int = 5, ): @@ -92,7 +91,7 @@ class Qwen2VLTester: self, images: list[ImageAsset], expected_outputs: list[list[str]], - lora_id: Optional[int] = None, + lora_id: int | None = None, temperature: float = 0, beam_width: int = 2, max_tokens: int = 5, diff --git a/tests/lora/test_resolver.py b/tests/lora/test_resolver.py index c70e58a375c78..9b5dedc4327fb 100644 --- a/tests/lora/test_resolver.py +++ b/tests/lora/test_resolver.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest @@ -14,7 +13,7 @@ class DummyLoRAResolver(LoRAResolver): async def resolve_lora( self, base_model_name: str, lora_name: str - ) -> Optional[LoRARequest]: + ) -> LoRARequest | None: if lora_name == "test_lora": return LoRARequest( lora_name=lora_name, diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py index c861a52d68721..eb026c2ec0209 100644 --- a/tests/lora/test_utils.py +++ b/tests/lora/test_utils.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import OrderedDict -from typing import NamedTuple, Optional +from typing import NamedTuple from unittest.mock import patch import pytest @@ -21,7 +21,7 @@ class LoRANameParserTestConfig(NamedTuple): name: str module_name: str is_lora_a: bool - weights_mapper: Optional[WeightsMapper] = None + weights_mapper: WeightsMapper | None = None def test_parse_fine_tuned_lora_name_valid(): diff --git a/tests/lora/utils.py 
b/tests/lora/utils.py index b522aa6b08743..d30b77f094665 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -4,7 +4,6 @@ import json import os from dataclasses import dataclass -from typing import Optional, Union import torch from safetensors.torch import save_file @@ -81,7 +80,7 @@ class DummyLoRAManager: module_name: str, input_dim: int, output_dims: list[int], - noop_lora_index: Optional[list[int]] = None, + noop_lora_index: list[int] | None = None, rank: int = 8, ): base_loras: list[LoRALayerWeights] = [] @@ -113,7 +112,7 @@ def assert_close(a, b): @dataclass class PunicaTensors: inputs_tensor: torch.Tensor - lora_weights: Union[torch.Tensor, list[torch.Tensor]] + lora_weights: torch.Tensor | list[torch.Tensor] our_out_tensor: torch.Tensor ref_out_tensor: torch.Tensor b_seq_start_loc: torch.Tensor diff --git a/tests/model_executor/model_loader/tensorizer_loader/conftest.py b/tests/model_executor/model_loader/tensorizer_loader/conftest.py index add6d3742ff53..74724a3b398dd 100644 --- a/tests/model_executor/model_loader/tensorizer_loader/conftest.py +++ b/tests/model_executor/model_loader/tensorizer_loader/conftest.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Callable +from collections.abc import Callable import pytest diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 12aad4cb8da0f..bf290079469aa 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest import torch @@ -76,7 +75,7 @@ class Relu3(ReLUSquaredActivation): ], ) def test_enabled_ops( - env: Optional[str], + env: str | None, torch_level: int, use_inductor: bool, ops_enabled: list[int], diff --git 
a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index b161cc7153b8f..ad37d1ad82c03 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest import torch @@ -138,7 +137,7 @@ def test_models( example_prompts, max_tokens, num_logprobs ) - prompt_embeds: Optional[list[torch.Tensor]] = [] if use_prompt_embeds else None + prompt_embeds: list[torch.Tensor] | None = [] if use_prompt_embeds else None prompt_token_ids = [] for prompt in example_prompts: diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index abedd15b0d7eb..fd2df329f17f9 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Callable +from collections.abc import Callable import pytest diff --git a/tests/models/language/generation_ppl_test/ppl_utils.py b/tests/models/language/generation_ppl_test/ppl_utils.py index 43f6066b1c85e..cfa09635effc1 100644 --- a/tests/models/language/generation_ppl_test/ppl_utils.py +++ b/tests/models/language/generation_ppl_test/ppl_utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from https://huggingface.co/docs/transformers/perplexity -from typing import Optional, cast +from typing import cast import pytest import torch @@ -85,7 +85,7 @@ def wikitext_ppl_test( n_tokens = 0 for output in outputs: output = cast(TokensTextLogprobsPromptLogprobs, output) - token_datas = cast(list[Optional[dict[int, Logprob]]], output[3]) + token_datas = cast(list[dict[int, 
Logprob] | None], output[3]) assert token_datas[0] is None token_log_probs = [] diff --git a/tests/models/language/pooling/embed_utils.py b/tests/models/language/pooling/embed_utils.py index 261ab80ae86bc..4ac40656bc62a 100644 --- a/tests/models/language/pooling/embed_utils.py +++ b/tests/models/language/pooling/embed_utils.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence -from typing import Optional import pytest @@ -13,7 +12,7 @@ def run_embedding_correctness_test( hf_model: "HfRunner", inputs: list[str], vllm_outputs: Sequence[list[float]], - dimensions: Optional[int] = None, + dimensions: int | None = None, ): hf_outputs = hf_model.encode(inputs) if dimensions: diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index c9574dca498ee..c8deffbf66dba 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest @@ -66,7 +65,7 @@ def test_models( pooling_type="MEAN", normalize=False ) - max_model_len: Optional[int] = 512 + max_model_len: int | None = 512 if model in [ "sentence-transformers/all-MiniLM-L12-v2", "sentence-transformers/stsb-roberta-base-v2", diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py index 14308ac06c03e..0adc9b5cf25f6 100644 --- a/tests/models/language/pooling/test_gritlm.py +++ b/tests/models/language/pooling/test_gritlm.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from __future__ import annotations - import numpy as np import openai import pytest diff --git a/tests/models/language/pooling_mteb_test/mteb_utils.py 
b/tests/models/language/pooling_mteb_test/mteb_utils.py index 65ad49fad3653..f2a8177377491 100644 --- a/tests/models/language/pooling_mteb_test/mteb_utils.py +++ b/tests/models/language/pooling_mteb_test/mteb_utils.py @@ -3,7 +3,6 @@ import tempfile from collections.abc import Sequence -from typing import Optional import mteb import numpy as np @@ -51,7 +50,7 @@ class VllmMtebEncoder(mteb.Encoder): def predict( self, - sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt + sentences: list[tuple[str, str, str | None]], # query, corpus, prompt *args, **kwargs, ) -> np.ndarray: @@ -100,7 +99,7 @@ class ScoreClientMtebEncoder(mteb.Encoder): def predict( self, - sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt + sentences: list[tuple[str, str, str | None]], # query, corpus, prompt *args, **kwargs, ) -> np.ndarray: @@ -294,7 +293,7 @@ def mteb_test_rerank_models_hf( original_predict = hf_model.predict def _predict( - sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt + sentences: list[tuple[str, str, str | None]], # query, corpus, prompt *args, **kwargs, ): diff --git a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py index 9e95dd74c3978..2927a37111364 100644 --- a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py +++ b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import Any import numpy as np import pytest @@ -111,7 +111,7 @@ class GemmaMtebEncoder(VllmMtebEncoder): def predict( self, - sentences: list[tuple[str, str, Optional[str]]], # query, corpus, prompt + sentences: list[tuple[str, str, str | None]], # query, corpus, prompt *args, **kwargs, ) -> np.ndarray: diff --git 
a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py index ef08b1916aa5f..e39dfc888779e 100644 --- a/tests/models/multimodal/generation/test_granite_speech.py +++ b/tests/models/multimodal/generation/test_granite_speech.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence -from typing import Optional import pytest from transformers import AutoModelForSpeechSeq2Seq @@ -18,8 +17,8 @@ HF_AUDIO_PROMPT = "<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: def vllm_to_hf_output( - vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], -) -> tuple[list[int], str, Optional[SampleLogprobs]]: + vllm_output: tuple[list[int], str, SampleLogprobs | None], +) -> tuple[list[int], str, SampleLogprobs | None]: """Sanitize hf output to be comparable with vllm output.""" output_ids, output_str, out_logprobs = vllm_output @@ -46,7 +45,7 @@ def run_test( max_tokens: int, num_logprobs: int, tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, + distributed_executor_backend: str | None = None, ): """Inference result should be the same between hf and vllm. 
diff --git a/tests/models/multimodal/generation/test_phi4_multimodal.py b/tests/models/multimodal/generation/test_phi4_multimodal.py index 132c69285c5c7..cbc7dfca0234d 100644 --- a/tests/models/multimodal/generation/test_phi4_multimodal.py +++ b/tests/models/multimodal/generation/test_phi4_multimodal.py @@ -3,7 +3,6 @@ import os from collections.abc import Sequence -from typing import Optional import librosa import pytest @@ -57,7 +56,7 @@ if current_platform.is_rocm(): def run_test( hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - inputs: Sequence[tuple[list[str], PromptImageInput, Optional[PromptAudioInput]]], + inputs: Sequence[tuple[list[str], PromptImageInput, PromptAudioInput | None]], model: str, *, max_model_len: int, @@ -66,7 +65,7 @@ def run_test( num_logprobs: int, mm_limit: int, tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, + distributed_executor_backend: str | None = None, ): """Inference result should be the same between hf and vllm. 
diff --git a/tests/models/multimodal/generation/test_phi4mm.py b/tests/models/multimodal/generation/test_phi4mm.py index e69d44c6a1319..5619cecc081d2 100644 --- a/tests/models/multimodal/generation/test_phi4mm.py +++ b/tests/models/multimodal/generation/test_phi4mm.py @@ -3,7 +3,6 @@ import os from collections.abc import Sequence -from typing import Optional import librosa import pytest @@ -48,7 +47,7 @@ models = [model_path] def vllm_to_hf_output( - vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], model: str + vllm_output: tuple[list[int], str, SampleLogprobs | None], model: str ): """Sanitize vllm output to be comparable with hf output.""" _, output_str, out_logprobs = vllm_output @@ -79,7 +78,7 @@ if current_platform.is_rocm(): def run_test( hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - inputs: Sequence[tuple[list[str], PromptImageInput, Optional[PromptAudioInput]]], + inputs: Sequence[tuple[list[str], PromptImageInput, PromptAudioInput | None]], model: str, *, max_model_len: int, @@ -88,7 +87,7 @@ def run_test( num_logprobs: int, mm_limit: int, tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, + distributed_executor_backend: str | None = None, ): """Inference result should be the same between hf and vllm. 
diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py index bde07da9101ac..3cad2c43d5623 100644 --- a/tests/models/multimodal/generation/test_pixtral.py +++ b/tests/models/multimodal/generation/test_pixtral.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from dataclasses import asdict -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Any import pytest from mistral_common.multimodal import download_image @@ -117,7 +117,7 @@ FIXTURE_LOGPROBS_CHAT = { MISTRAL_SMALL_3_1_ID: FIXTURES_PATH / "mistral_small_3_chat.json", } -OutputsLogprobs = list[tuple[list[int], str, Optional[SampleLogprobs]]] +OutputsLogprobs = list[tuple[list[int], str, SampleLogprobs | None]] # For the test author to store golden output in JSON diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py index a8f0ba8701850..a4abf6e405f74 100644 --- a/tests/models/multimodal/generation/test_qwen2_vl.py +++ b/tests/models/multimodal/generation/test_qwen2_vl.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional, TypedDict, Union +from typing import Any, TypedDict import numpy.typing as npt import pytest @@ -83,7 +83,7 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict): def batch_make_image_embeddings( - image_batches: list[Union[Image.Image, list[Image.Image]]], + image_batches: list[Image.Image | list[Image.Image]], processor, llm: VllmRunner, ) -> list[Qwen2VLPromptImageEmbeddingInput]: @@ -272,7 +272,7 @@ def run_embedding_input_test( num_logprobs: int, mm_limit: int, tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, + distributed_executor_backend: str | None = None, ): """Inference result should be the same between original image/video input and image/video 
embeddings input. diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py index 766f09b0d3207..eca2b61e37d53 100644 --- a/tests/models/multimodal/generation/test_whisper.py +++ b/tests/models/multimodal/generation/test_whisper.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional import pytest @@ -92,7 +91,7 @@ def run_test( model: str, *, tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, + distributed_executor_backend: str | None = None, ) -> None: prompt_list = PROMPTS * 10 expected_list = EXPECTED[model] * 10 diff --git a/tests/models/multimodal/generation/vlm_utils/builders.py b/tests/models/multimodal/generation/vlm_utils/builders.py index 096931cca09f7..6252f33bdfad7 100644 --- a/tests/models/multimodal/generation/vlm_utils/builders.py +++ b/tests/models/multimodal/generation/vlm_utils/builders.py @@ -2,9 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Helpers for building inputs that can be leveraged for different test types.""" -from collections.abc import Iterable +from collections.abc import Callable, Iterable from pathlib import PosixPath -from typing import Callable, Optional, Union import torch @@ -47,9 +46,9 @@ def replace_test_placeholder( def get_model_prompts( base_prompts: Iterable[str], - img_idx_to_prompt: Optional[Callable[[int], str]], - video_idx_to_prompt: Optional[Callable[[int], str]], - audio_idx_to_prompt: Optional[Callable[[int], str]], + img_idx_to_prompt: Callable[[int], str] | None, + video_idx_to_prompt: Callable[[int], str] | None, + audio_idx_to_prompt: Callable[[int], str] | None, prompt_formatter: Callable[[str], str], ) -> list[str]: """Given a model-agnostic base prompt and test configuration for a model(s) @@ -93,7 +92,7 @@ def build_single_image_inputs_from_test_info( test_info: VLMTestInfo, 
image_assets: ImageTestAssets, size_wrapper: ImageSizeWrapper, - tmp_path: Optional[PosixPath] = None, + tmp_path: PosixPath | None = None, ) -> list[PromptWithMultiModalInput]: if test_info.prompt_formatter is None: raise ValueError("Prompt formatter must be set to build single image inputs") @@ -147,7 +146,7 @@ def build_multi_image_inputs_from_test_info( test_info: VLMTestInfo, image_assets: ImageTestAssets, size_wrapper: ImageSizeWrapper, - tmp_path: Optional[PosixPath] = None, + tmp_path: PosixPath | None = None, ) -> list[PromptWithMultiModalInput]: if test_info.prompt_formatter is None: raise ValueError("Prompt formatter must be set to build multi image inputs") @@ -266,9 +265,7 @@ def build_video_inputs_from_test_info( ] -def apply_image_size_scaling( - image, size: Union[float, tuple[int, int]], size_type: SizeType -): +def apply_image_size_scaling(image, size: float | tuple[int, int], size_type: SizeType): """Applies a size scaler to one image; this can be an image size factor, which scales the image while maintaining the aspect ratio""" # Special case for embeddings; if it's a tensor, it's only valid if we diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py index 5748ccc14c294..8d0e9b3eee9fd 100644 --- a/tests/models/multimodal/generation/vlm_utils/core.py +++ b/tests/models/multimodal/generation/vlm_utils/core.py @@ -2,7 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Core test implementation to be shared across modalities.""" -from typing import Any, Callable, Optional +from collections.abc import Callable +from typing import Any import torch from transformers.models.auto.auto_factory import _BaseAutoModelClass @@ -27,21 +28,21 @@ def run_test( enforce_eager: bool, max_model_len: int, max_num_seqs: int, - hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]], - vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]], + 
hf_output_post_proc: Callable[[RunnerOutput, str], Any] | None, + vllm_output_post_proc: Callable[[RunnerOutput, str], Any] | None, auto_cls: type[_BaseAutoModelClass], use_tokenizer_eos: bool, comparator: Callable[..., None], - get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]], - stop_str: Optional[list[str]], + get_stop_token_ids: Callable[[AnyTokenizer], list[int]] | None, + stop_str: list[str] | None, limit_mm_per_prompt: dict[str, int], - vllm_runner_kwargs: Optional[dict[str, Any]], - hf_model_kwargs: Optional[dict[str, Any]], - patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]], + vllm_runner_kwargs: dict[str, Any] | None, + hf_model_kwargs: dict[str, Any] | None, + patch_hf_runner: Callable[[HfRunner], HfRunner] | None, runner: RunnerOption = "auto", - distributed_executor_backend: Optional[str] = None, + distributed_executor_backend: str | None = None, tensor_parallel_size: int = 1, - vllm_embeddings: Optional[torch.Tensor] = None, + vllm_embeddings: torch.Tensor | None = None, ): """Modality agnostic test executor for comparing HF/vLLM outputs.""" # In the case of embeddings, vLLM takes separate input tensors diff --git a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py index 8f2f8bba39ca2..8c9c390911bdc 100644 --- a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py +++ b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Custom input builders for edge-cases in different models.""" -from typing import Callable +from collections.abc import Callable from vllm.assets.image import ImageAsset from vllm.multimodal.image import rescale_image_size diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index e51d895772c05..d9c1d53b61c28 100644 --- 
a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -7,7 +7,6 @@ typically specific to a small subset of models. import types from pathlib import PosixPath -from typing import Optional, Union import numpy as np import numpy.typing as npt @@ -58,7 +57,7 @@ def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutpu def qwen_vllm_to_hf_output( vllm_output: RunnerOutput, model: str -) -> tuple[list[int], str, Optional[SampleLogprobs]]: +) -> tuple[list[int], str, SampleLogprobs | None]: """Sanitize vllm output [qwen models] to be comparable with hf output.""" output_ids, output_str, out_logprobs = vllm_output @@ -69,7 +68,7 @@ def qwen_vllm_to_hf_output( def qwen2_vllm_to_hf_output( vllm_output: RunnerOutput, model: str -) -> tuple[list[int], str, Optional[SampleLogprobs]]: +) -> tuple[list[int], str, SampleLogprobs | None]: """Sanitize vllm output [qwen2 models] to be comparable with hf output.""" output_ids, output_str, out_logprobs = vllm_output @@ -80,7 +79,7 @@ def qwen2_vllm_to_hf_output( def kimiv_vl_vllm_to_hf_output( vllm_output: RunnerOutput, model: str -) -> tuple[list[int], str, Optional[SampleLogprobs]]: +) -> tuple[list[int], str, SampleLogprobs | None]: """Sanitize vllm output [kimi_vl models] to be comparable with hf output.""" output_ids, output_str, out_logprobs = vllm_output @@ -99,7 +98,7 @@ def llava_image_vllm_to_hf_output( def llava_video_vllm_to_hf_output( vllm_output: RunnerOutput, model: str -) -> tuple[list[int], str, Optional[SampleLogprobs]]: +) -> tuple[list[int], str, SampleLogprobs | None]: config = AutoConfig.from_pretrained(model) mm_token_id = config.video_token_index return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id) @@ -263,7 +262,7 @@ def get_llava_embeddings(image_assets: ImageTestAssets): ####### Prompt path encoders for models that need models on disk def qwen_prompt_path_encoder( - tmp_path: PosixPath, prompt: str, 
assets: Union[list[ImageAsset], ImageTestAssets] + tmp_path: PosixPath, prompt: str, assets: list[ImageAsset] | ImageTestAssets ) -> str: """Given a temporary dir path, export one or more image assets into the tempdir & replace its contents with the local path to the string so that @@ -440,7 +439,7 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: self.max_num = self.config.max_dynamic_patch self.image_size = self.vision_config.image_size - def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs): + def __call__(self, text: str, images: Image | list[Image], **kwargs): from vllm.model_executor.models.h2ovl import ( IMG_CONTEXT, IMG_END, @@ -499,7 +498,7 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner: self.max_num = self.config.max_dynamic_patch self.image_size = self.vision_config.image_size - def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs): + def __call__(self, text: str, images: Image | list[Image], **kwargs): from vllm.model_executor.models.skyworkr1v import ( IMG_CONTEXT, IMG_END, @@ -560,8 +559,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: def __call__( self, text: str, - images: Union[Image, list[Image]] = None, - videos: Union[npt.NDArray, list[npt.NDArray]] = None, + images: Image | list[Image] = None, + videos: npt.NDArray | list[npt.NDArray] = None, **kwargs, ): from vllm.model_executor.models.internvl import ( @@ -650,7 +649,7 @@ def _internvl_generate( self, pixel_values: torch.FloatTensor, input_ids: torch.FloatTensor, - attention_mask: Optional[torch.LongTensor] = None, + attention_mask: torch.LongTensor | None = None, **generate_kwargs, ) -> torch.LongTensor: """Generate method for InternVL2 model without fixed use_cache.""" diff --git a/tests/models/multimodal/generation/vlm_utils/types.py b/tests/models/multimodal/generation/vlm_utils/types.py index 6e82f7e3306ab..fe02f71884324 100644 --- a/tests/models/multimodal/generation/vlm_utils/types.py +++ 
b/tests/models/multimodal/generation/vlm_utils/types.py @@ -2,10 +2,10 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Types for writing multimodal model tests.""" -from collections.abc import Iterable +from collections.abc import Callable, Iterable from enum import Enum from pathlib import PosixPath -from typing import Any, Callable, NamedTuple, Optional, Union +from typing import Any, NamedTuple import torch from pytest import MarkDecorator @@ -52,16 +52,16 @@ VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?" IMAGE_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)] EMBEDDING_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0)] -RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]] +RunnerOutput = tuple[list[int], str, SampleLogprobs | None] class PromptWithMultiModalInput(NamedTuple): """Holds the multimodal input for a single test case.""" prompts: list[str] - image_data: Optional[PromptImageInput] = None - video_data: Optional[PromptVideoInput] = None - audio_data: Optional[PromptAudioInput] = None + image_data: PromptImageInput | None = None + video_data: PromptVideoInput | None = None + audio_data: PromptAudioInput | None = None class VLMTestType(Enum): @@ -87,17 +87,17 @@ class ImageSizeWrapper(NamedTuple): type: SizeType # A size factor is a wrapper of 0+ floats, # while a fixed size contains an iterable of integer pairs - data: Union[Iterable[float], Iterable[tuple[int, int]]] + data: Iterable[float] | Iterable[tuple[int, int]] class VLMTestInfo(NamedTuple): """Holds the configuration for 1+ tests for one model architecture.""" models: list[str] - test_type: Union[VLMTestType, Iterable[VLMTestType]] + test_type: VLMTestType | Iterable[VLMTestType] # Should be None only if this is a CUSTOM_INPUTS test - prompt_formatter: Optional[Callable[[str], str]] = None + prompt_formatter: Callable[[str], str] | None = None img_idx_to_prompt: Callable[[int], str] = lambda idx: "\n" video_idx_to_prompt: 
Callable[[int], str] = lambda idx: "