[CI/Build] drop support for Python 3.8 EOL (#8464)

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2026-07-25 17:27:22 +08:00 · 2024-11-06 02:11:55 -05:00 · 2024-11-06 02:11:55 -05:00 · 21063c11c7
commit 21063c11c7
parent 4be3a45158
115 changed files with 239 additions and 321 deletions
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@ -56,7 +56,7 @@ serving_column_mapping = {
 def read_markdown(file):
    if os.path.exists(file):
-        with open(file, "r") as f:
+        with open(file) as f:
            return f.read() + "\n"
    else:
        return f"{file} not found.\n"
@ -75,14 +75,14 @@ if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):
-        with open(test_file, "r") as f:
+        with open(test_file) as f:
            raw_result = json.loads(f.read())
        if "serving" in str(test_file):
            # this result is generated via `benchmark_serving.py`
            # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
+            with open(test_file.with_suffix(".commands")) as f:
                command = json.loads(f.read())
            raw_result.update(command)
@ -97,7 +97,7 @@ if __name__ == "__main__":
            # this result is generated via `benchmark_latency.py`
            # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
+            with open(test_file.with_suffix(".commands")) as f:
                command = json.loads(f.read())
            raw_result.update(command)
@ -119,7 +119,7 @@ if __name__ == "__main__":
            # this result is generated via `benchmark_throughput.py`
            # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
+            with open(test_file.with_suffix(".commands")) as f:
                command = json.loads(f.read())
            raw_result.update(command)
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@ -72,7 +72,7 @@ def main(args):
    # collect results
    for test_file in results_folder.glob("*_nightly_results.json"):
-        with open(test_file, "r") as f:
+        with open(test_file) as f:
            results = results + json.loads(f.read())
    # generate markdown table
@ -80,7 +80,7 @@ def main(args):
    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
-    with open(args.description, "r") as f:
+    with open(args.description) as f:
        description = f.read()
    description = description.format(
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@ -36,11 +36,11 @@ if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):
-        with open(test_file, "r") as f:
+        with open(test_file) as f:
            raw_result = json.loads(f.read())
        # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands"), "r") as f:
+        with open(test_file.with_suffix(".commands")) as f:
            command = json.loads(f.read())
        raw_result.update(command)
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@ -25,7 +25,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
    steps:
    - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
    - name: Set up Python ${{ matrix.python-version }}
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@ -48,7 +48,7 @@ jobs:
      fail-fast: false
      matrix:
          os: ['ubuntu-20.04']
-          python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
+          python-version: ['3.9', '3.10', '3.11', '3.12']
          pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements-cuda.txt.
          cuda-version: ['11.8', '12.1']
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@ -29,19 +29,19 @@ jobs:
      matrix:
        python-version: ["3.12"]
    steps:
-    - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+      - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
-    - name: Set up Python ${{ matrix.python-version }}
+      - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
-      with:
+        with:
-        python-version: ${{ matrix.python-version }}
+          python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
+      - name: Install dependencies
-      run: |
+        run: |
-        python -m pip install --upgrade pip
+          python -m pip install --upgrade pip
-        pip install -r requirements-lint.txt
+          pip install -r requirements-lint.txt
-    - name: Analysing the code with ruff
+      - name: Analysing the code with ruff
-      run: |
+        run: |
-        echo "::add-matcher::.github/workflows/matchers/ruff.json"
+          echo "::add-matcher::.github/workflows/matchers/ruff.json"
-        ruff check --output-format github .
+          ruff check --output-format github .
-    - name: Run isort
+      - name: Run isort
-      run: |
+        run: |
-        isort . --check-only
+          isort . --check-only
--- a/.github/workflows/yapf.yml
+++ b/.github/workflows/yapf.yml
@ -23,16 +23,16 @@ jobs:
      matrix:
        python-version: ["3.12"]
    steps:
-    - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+      - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
-    - name: Set up Python ${{ matrix.python-version }}
+      - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
-      with:
+        with:
-        python-version: ${{ matrix.python-version }}
+          python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
+      - name: Install dependencies
-      run: |
+        run: |
-        python -m pip install --upgrade pip
+          python -m pip install --upgrade pip
-        pip install yapf==0.32.0
+          pip install yapf==0.32.0
-        pip install toml==0.10.2
+          pip install toml==0.10.2
-    - name: Running yapf
+      - name: Running yapf
-      run: |
+        run: |
-        yapf --diff --recursive .
+          yapf --diff --recursive .
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@ -6,17 +6,16 @@ version: 2
 build:
  os: ubuntu-22.04
  tools:
-    python: "3.8"
+    python: '3.9'
 sphinx:
-   configuration: docs/source/conf.py
+  configuration: docs/source/conf.py
-   fail_on_warning: true
+  fail_on_warning: true
 # If using Sphinx, optionally build your docs in additional formats such as PDF
 formats: []
 # Optionally declare the Python requirements required to build your docs
 python:
-   install:
+  install:
-   - requirements: docs/requirements-docs.txt
+    - requirements: docs/requirements-docs.txt
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -128,9 +128,9 @@ endif()
 if(VLLM_GPU_LANG STREQUAL "CUDA")
  #
-  # For cuda we want to be able to control which architectures we compile for on 
+  # For cuda we want to be able to control which architectures we compile for on
  # a per-file basis in order to cut down on compile time. So here we extract
-  # the set of architectures we want to compile for and remove the from the 
+  # the set of architectures we want to compile for and remove the from the
  # CMAKE_CUDA_FLAGS so that they are not applied globally.
  #
  clear_cuda_arches(CUDA_ARCH_FLAGS)
@ -138,7 +138,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
  # Filter the target architectures by the supported supported archs
  # since for some files we will build for all CUDA_ARCHS.
-  cuda_archs_loose_intersection(CUDA_ARCHS 
+  cuda_archs_loose_intersection(CUDA_ARCHS
    "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
  message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
 else()
@ -236,7 +236,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # are not supported by Machete yet.
  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
  if (MARLIN_ARCHS)
-    set(MARLIN_SRCS 
+    set(MARLIN_SRCS
       "csrc/quantization/fp8/fp8_marlin.cu"
       "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
       "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
@ -277,7 +277,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                     "in CUDA target architectures")
    endif()
-    # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't 
+    # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
    # build any 3x kernels
    set(SCALED_MM_3X_ARCHS)
  endif()
@ -285,7 +285,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  #
  # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
  # kernels for the remaining archs that are not already built for 3x.
-  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS 
+  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
    "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
  # subtract out the archs that are already built for 3x
  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
@ -316,10 +316,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
    #
-    # For the Machete kernels we automatically generate sources for various 
+    # For the Machete kernels we automatically generate sources for various
    # preselected input type pairs and schedules.
    # Generate sources:
-    set(MACHETE_GEN_SCRIPT 
+    set(MACHETE_GEN_SCRIPT
      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
    file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)
@ -329,8 +329,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
        OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
      execute_process(
-        COMMAND ${CMAKE_COMMAND} -E env 
+        COMMAND ${CMAKE_COMMAND} -E env
-        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH 
+        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
          ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
        RESULT_VARIABLE machete_generation_result
        OUTPUT_VARIABLE machete_generation_output
@ -340,11 +340,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      if (NOT machete_generation_result EQUAL 0)
        message(FATAL_ERROR "Machete generation failed."
-                            " Result: \"${machete_generation_result}\"" 
+                            " Result: \"${machete_generation_result}\""
                            "\nCheck the log for details: "
                            "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
      else()
-        set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH} 
+        set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
            CACHE STRING "Last run machete generate script hash" FORCE)
        message(STATUS "Machete generation completed successfully.")
      endif()
@ -366,7 +366,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
  else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
        AND MACHETE_ARCHS)
      message(STATUS "Not building Machete kernels as CUDA Compiler version is "
                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
@ -392,8 +392,8 @@ define_gpu_extension_target(
  USE_SABI 3
  WITH_SOABI)
-# If CUTLASS is compiled on NVCC >= 12.5, it by default uses 
+# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
-# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the 
+# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
 # driver API. This causes problems when linking with earlier versions of CUDA.
 # Setting this variable sidesteps the issue by calling the driver directly.
 target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
@ -471,9 +471,9 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
  return()
 endif ()
-# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target  
+# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
-# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the 
+# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
-# arches in the CUDA case (and instead set the gencodes on a per file basis) 
+# arches in the CUDA case (and instead set the gencodes on a per file basis)
 # we need to manually set VLLM_GPU_ARCHES here.
 if(VLLM_GPU_LANG STREQUAL "CUDA")
  foreach(_ARCH ${CUDA_ARCHS})
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@ -79,7 +79,7 @@ async def async_request_tgi(
                        # any data, we should skip it.
                        if chunk_bytes.startswith(":"):
                            continue
-                        chunk = remove_prefix(chunk_bytes, "data:")
+                        chunk = chunk_bytes.removeprefix("data:")
                        data = json.loads(chunk)
                        timestamp = time.perf_counter()
@ -144,8 +144,8 @@ async def async_request_trt_llm(
                        if not chunk_bytes:
                            continue
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                        chunk = chunk_bytes.decode("utf-8").removeprefix(
-                                              "data:")
+                            "data:")
                        data = json.loads(chunk)
                        output.generated_text += data["text_output"]
@ -261,8 +261,8 @@ async def async_request_openai_completions(
                        if not chunk_bytes:
                            continue
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                        chunk = chunk_bytes.decode("utf-8").removeprefix(
-                                              "data: ")
+                            "data: ")
                        if chunk == "[DONE]":
                            latency = time.perf_counter() - st
                        else:
@ -349,8 +349,8 @@ async def async_request_openai_chat_completions(
                        if not chunk_bytes:
                            continue
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                        chunk = chunk_bytes.decode("utf-8").removeprefix(
-                                              "data: ")
+                            "data: ")
                        if chunk == "[DONE]":
                            latency = time.perf_counter() - st
                        else:
@ -389,14 +389,6 @@ async def async_request_openai_chat_completions(
    return output
 # Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
 # introduced in Python 3.9
 def remove_prefix(text: str, prefix: str) -> str:
    if text.startswith(prefix):
        return text[len(prefix):]
    return text
 def get_model(pretrained_model_name_or_path: str) -> str:
    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
        from modelscope import snapshot_download
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@ -269,10 +269,10 @@ def run_square_bench(args):
 def run_range_bench(args):
-    m_start, k_start, n_start = [int(x) for x in args.dim_start.split(",")]
+    m_start, k_start, n_start = (int(x) for x in args.dim_start.split(","))
-    m_end, k_end, n_end = [int(x) for x in args.dim_end.split(",")]
+    m_end, k_end, n_end = (int(x) for x in args.dim_end.split(","))
    m_increment, k_increment, n_increment = \
-        [int(x) for x in args.dim_increment.split(",")]
+        (int(x) for x in args.dim_increment.split(","))
    Ms = list(range(m_start, m_end + 1, m_increment))
    Ks = list(range(k_start, k_end + 1, k_increment))
    Ns = list(range(n_start, n_end + 1, n_increment))
--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@ -468,7 +468,7 @@ def generate():
    impl_configs = []
    GPTQ_kernel_type_configs = list(
-        (TypeConfig(
+        TypeConfig(
            element_a=element_a,
            element_b=element_b,
            element_b_scale=element_a,
@ -476,7 +476,7 @@ def generate():
            element_d=element_a,
            accumulator=DataType.f32,
        ) for element_b in (VLLMDataType.u4b8, VLLMDataType.u8b128)
-         for element_a in (DataType.f16, DataType.bf16)))
+        for element_a in (DataType.f16, DataType.bf16))
    GPTQ_kernel_specializations = [
        Specialization(with_C=False, with_zeropoints=False, with_scales=True)
@ -490,7 +490,7 @@ def generate():
    ]
    AWQ_kernel_type_configs = list(
-        (TypeConfig(
+        TypeConfig(
            element_a=element_a,
            element_b=element_b,
            element_b_scale=element_a,
@ -498,7 +498,7 @@ def generate():
            element_d=element_a,
            accumulator=DataType.f32,
        ) for element_b in (DataType.u4, DataType.u8)
-         for element_a in (DataType.f16, DataType.bf16)))
+        for element_a in (DataType.f16, DataType.bf16))
    AWQ_kernel_specializations = [
        Specialization(with_C=False, with_zeropoints=True, with_scales=True)
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@ -10,7 +10,7 @@ Requirements
 ============
 * OS: Linux
-* Python: 3.8 - 3.12
+* Python: 3.9 -- 3.12
 * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
 Install released versions
@ -148,7 +148,7 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. T
 .. tip::
    Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results.
-    For example, you can install `ccache <https://github.com/ccache/ccache>`_ using ``conda install ccache`` or ``apt install ccache`` . 
+    For example, you can install `ccache <https://github.com/ccache/ccache>`_ using ``conda install ccache`` or ``apt install ccache`` .
    As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
@ -181,8 +181,8 @@ to be run simultaneously, via the environment variable ``MAX_JOBS``. For example
    $ export MAX_JOBS=6
    $ pip install -e .
-This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default <https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings>`_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory. 
+This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default <https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings>`_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory.
-A side effect is a much slower build process. 
+A side effect is a much slower build process.
 Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image.
@ -209,7 +209,7 @@ Here is a sanity check to verify that the CUDA Toolkit is correctly installed:
 Unsupported OS build
 --------------------
-vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. 
+vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems.
 Simply disable the ``VLLM_TARGET_DEVICE`` environment variable before installing:
--- a/pyproject.toml
+++ b/pyproject.toml
@ -34,7 +34,7 @@ select = [
    # Pyflakes
    "F",
    # pyupgrade
-    # "UP",
+    "UP",
    # flake8-bugbear
    "B",
    # flake8-simplify
@ -55,7 +55,7 @@ ignore = [
 ]
 [tool.mypy]
-python_version = "3.8"
+python_version = "3.9"
 ignore_missing_imports = true
 check_untyped_defs = true
--- a/setup.py
+++ b/setup.py
@ -1,5 +1,4 @@
 import importlib.util
 import io
 import logging
 import os
 import re
@ -327,7 +326,7 @@ def get_neuronxcc_version():
                                "__init__.py")
    # Check if the command was executed successfully
-    with open(version_file, "rt") as fp:
+    with open(version_file) as fp:
        content = fp.read()
    # Extract the version using a regular expression
@ -404,7 +403,8 @@ def read_readme() -> str:
    """Read the README file if present."""
    p = get_path("README.md")
    if os.path.isfile(p):
-        return io.open(get_path("README.md"), "r", encoding="utf-8").read()
+        with open(get_path("README.md"), encoding="utf-8") as f:
            return f.read()
    else:
        return ""
@ -498,7 +498,6 @@ setup(
        "Documentation": "https://vllm.readthedocs.io/en/latest/",
    },
    classifiers=[
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
@ -512,7 +511,7 @@ setup(
    ],
    packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples",
                                    "tests*")),
-    python_requires=">=3.8",
+    python_requires=">=3.9",
    install_requires=get_requirements(),
    ext_modules=ext_modules,
    extras_require={
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@ -429,8 +429,8 @@ def benchmark():
    # print in tabular format
    print("batch size\teager mode\tfull cudagraph\tpiecewise cudagraph")
    for b in cudagraph_sizes:
-        print((f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}"
+        print(f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}"
-               f"\t{piecewise_cudagraph_time[b]:.3f}"))
+              f"\t{piecewise_cudagraph_time[b]:.3f}")
 if __name__ == "__main__":
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -1,6 +1,5 @@
 import json
 import os
 import sys
 import tempfile
 from collections import UserList
 from enum import Enum
@ -52,7 +51,7 @@ PromptVideoInput = _PromptMultiModalInput[np.ndarray]
 def _read_prompts(filename: str) -> List[str]:
-    with open(filename, "r") as f:
+    with open(filename) as f:
        prompts = f.readlines()
        return prompts
@ -62,14 +61,8 @@ class _ImageAssetPrompts(TypedDict):
    cherry_blossom: str
-if sys.version_info < (3, 9):
+class _ImageAssetsBase(UserList[ImageAsset]):
-    # UserList cannot be subscripted
+    pass
    class _ImageAssetsBase(UserList):
        pass
 else:
    class _ImageAssetsBase(UserList[ImageAsset]):
        pass
 class _ImageAssets(_ImageAssetsBase):
@ -94,14 +87,8 @@ class _VideoAssetPrompts(TypedDict):
    sample_demo_1: str
-if sys.version_info < (3, 9):
+class _VideoAssetsBase(UserList[VideoAsset]):
-    # UserList cannot be subscripted
+    pass
    class _VideoAssetsBase(UserList):
        pass
 else:
    class _VideoAssetsBase(UserList[VideoAsset]):
        pass
 class _VideoAssets(_VideoAssetsBase):
@ -958,7 +945,7 @@ def dummy_opt_path():
                              "*.msgpack"
                          ])
        assert os.path.exists(json_path)
-        with open(json_path, "r") as f:
+        with open(json_path) as f:
            config = json.load(f)
        config["architectures"] = ["MyOPTForCausalLM"]
        with open(json_path, "w") as f:
@ -977,7 +964,7 @@ def dummy_llava_path():
                              "*.msgpack"
                          ])
        assert os.path.exists(json_path)
-        with open(json_path, "r") as f:
+        with open(json_path) as f:
            config = json.load(f)
        config["architectures"] = ["MyLlava"]
        with open(json_path, "w") as f:
@ -996,7 +983,7 @@ def dummy_gemma2_embedding_path():
                              "*.msgpack"
                          ])
        assert os.path.exists(json_path)
-        with open(json_path, "r") as f:
+        with open(json_path) as f:
            config = json.load(f)
        config["architectures"] = ["MyGemma2Embedding"]
        with open(json_path, "w") as f:
--- a/tests/core/block/test_prefix_caching_block.py
+++ b/tests/core/block/test_prefix_caching_block.py
@ -99,13 +99,11 @@ class TestPrefixCachingBlock:
        token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)]
-        first_chain, second_chain = [
+        first_chain, second_chain = (TestPrefixCachingBlock.create_chain(
-            TestPrefixCachingBlock.create_chain(
+            block_size=block_size,
-                block_size=block_size,
+            token_ids=token_ids,
-                token_ids=token_ids,
+            num_empty_trailing_blocks=num_empty_trailing_blocks)
-                num_empty_trailing_blocks=num_empty_trailing_blocks)
+                                     for _ in range(2))
            for _ in range(2)
        ]
        for first_chain_block, second_chain_block in zip(
                first_chain, second_chain):
--- a/tests/kernels/test_mamba_ssm.py
+++ b/tests/kernels/test_mamba_ssm.py
@ -510,7 +510,7 @@ def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C,
        for var in (u_ref, delta_ref, B_ref, C_ref, z_ref)
    ]
    for i in range(len(seqlens[0])):
-        u_s, delta_s, B_s, C_s, z_s = [v[i].unsqueeze(0) for v in splits]
+        u_s, delta_s, B_s, C_s, z_s = (v[i].unsqueeze(0) for v in splits)
        if padded_state_indices[i] == PAD_SLOT_ID:
            continue
        out_ref_s, _ = selective_scan_ref(
--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
@ -104,7 +104,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
 # Sad path tests for the multimodal input processor and mapper, respectively
@pytest.mark.parametrize("mm_data", [
    {
-        "image": torch.rand((5))
+        "image": torch.rand(5)
    },
    {
        "image": torch.rand((5, 5, 5, 5, 5))
--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
@ -413,12 +413,10 @@ class _CorrectnessTestHelper:
    def generate_probs_for_test(
        self, draft_and_target_probs_equal: bool
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        draft_probs, target_probs = [
+        draft_probs, target_probs = (F.softmax(
-            F.softmax(
+            torch.rand(self.vocab_size, dtype=torch.float32),
-                torch.rand(self.vocab_size, dtype=torch.float32),
+            dim=-1,
-                dim=-1,
+        ) for _ in range(2))
            ) for _ in range(2)
        ]
        num_reference_probs = 100
        reference_probs = F.softmax(
--- a/tests/test_logger.py
+++ b/tests/test_logger.py
@ -29,7 +29,7 @@ def test_trace_function_call():
    cur_dir = os.path.dirname(__file__)
    enable_trace_function_call(path, cur_dir)
    f1(1)
-    with open(path, 'r') as f:
+    with open(path) as f:
        content = f.read()
    assert "f1" in content
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@ -93,10 +93,10 @@ def test_mistral_edge_case(tokenizer, truth):
 def skip_special_tokens(request, tokenizer_name) -> Generator[bool, Any, None]:
    if "mistral" in tokenizer_name:
        yield (
-            bool(True) if request.param else
+            True if request.param else
            pytest.skip("mistral doesn't support skip_special_tokens=False"))
    else:
-        yield bool(True) if request.param else bool(False)
+        yield bool(request.param)
@pytest.mark.parametrize("truth", TRUTH)
--- a/tools/profiler/print_layerwise_table.py
+++ b/tools/profiler/print_layerwise_table.py
@ -46,7 +46,7 @@ if __name__ == "__main__":
    args = parser.parse_args()
-    with open(args.json_trace, "r") as f:
+    with open(args.json_trace) as f:
        profile_data = json.load(f)
    if args.table == "summary":
--- a/tools/profiler/visualize_layerwise_profile.py
+++ b/tools/profiler/visualize_layerwise_profile.py
@ -434,7 +434,7 @@ def main(
                f"{', Sparsity ' + sparsity if sparsity else ''}")
    profile_json = None
-    with open(json_trace, "r") as f:
+    with open(json_trace) as f:
        profile_json = json.load(f)
    assert profile_json is not None
--- a/tools/report_build_time_ninja.py
+++ b/tools/report_build_time_ninja.py
@ -81,7 +81,7 @@ class Target:
        # Allow for modest floating-point errors
        epsilon = 0.000002
        if (self.weighted_duration > self.Duration() + epsilon):
-            print('%s > %s?' % (self.weighted_duration, self.Duration()))
+            print('{} > {}?'.format(self.weighted_duration, self.Duration()))
        assert (self.weighted_duration <= self.Duration() + epsilon)
        return self.weighted_duration
@ -104,7 +104,7 @@ def ReadTargets(log, show_all):
    The result is a list of Target objects."""
    header = log.readline()
    assert header == '# ninja log v5\n', \
-           'unrecognized ninja log version %r' % header
+           'unrecognized ninja log version {!r}'.format(header)
    targets_dict = {}
    last_end_seen = 0.0
    for line in log:
@ -254,8 +254,8 @@ def SummarizeEntries(entries, extra_step_types):
    # Warn if the sum of weighted times is off by more than half a second.
    if abs(length - weighted_total) > 500:
        print('Warning: Possible corrupt ninja log, results may be '
-              'untrustworthy. Length = %.3f, weighted total = %.3f' %
+              'untrustworthy. Length = {:.3f}, weighted total = {:.3f}'.format(
-              (length, weighted_total))
+                  length, weighted_total))
    entries_by_ext = defaultdict(list)
    for target in entries:
@ -263,16 +263,17 @@ def SummarizeEntries(entries, extra_step_types):
        entries_by_ext[extension].append(target)
    for key, values in entries_by_ext.items():
-        print('    Longest build steps for %s:' % key)
+        print('    Longest build steps for {}:'.format(key))
        values.sort(key=lambda x: x.WeightedDuration())
        for target in values[-long_count:]:
-            print('      %8.1f weighted s to build %s (%.1f s elapsed time)' %
+            print(
-                  (target.WeightedDuration(), target.DescribeTargets(),
+                '      {:8.1f} weighted s to build {} ({:.1f} s elapsed time)'.
-                   target.Duration()))
+                format(target.WeightedDuration(), target.DescribeTargets(),
                       target.Duration()))
-    print('    %.1f s weighted time (%.1f s elapsed time sum, %1.1fx '
+    print('    {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x '
-          'parallelism)' %
+          'parallelism)'.format(length, total_cpu_time,
-          (length, total_cpu_time, total_cpu_time * 1.0 / length))
+                                total_cpu_time * 1.0 / length))
    print('    %d build steps completed, average of %1.2f/s' %
          (len(entries), len(entries) / (length)))
@ -298,11 +299,12 @@ def main():
        long_ext_count += len(args.step_types.split(';'))
    try:
-        with open(log_file, 'r') as log:
+        with open(log_file) as log:
            entries = ReadTargets(log, False)
            SummarizeEntries(entries, args.step_types)
-    except IOError:
+    except OSError:
-        print('Log file %r not found, no build summary created.' % log_file)
+        print('Log file {!r} not found, no build summary created.'.format(
            log_file))
        return errno.ENOENT
--- a/use_existing_torch.py
+++ b/use_existing_torch.py
@ -4,7 +4,7 @@ requires_files = glob.glob('requirements*.txt')
 requires_files += ["pyproject.toml"]
 for file in requires_files:
    print(f">>> cleaning {file}")
-    with open(file, 'r') as f:
+    with open(file) as f:
        lines = f.readlines()
    if "torch" in "".join(lines).lower():
        print("removed:")
--- a/vllm/attention/ops/blocksparse_attention/interface.py
+++ b/vllm/attention/ops/blocksparse_attention/interface.py
@ -192,10 +192,8 @@ class LocalStridedBlockSparseAttn(torch.nn.Module):
        attn_mask = self.dense_attn_mask[None, :, :maxlen, :maxlen]
        q2 = self.transpose_and_pad(q, cu_seqlens, maxlen, 1)
-        k2, v2 = [
+        k2, v2 = (self.transpose_and_pad(x, cu_seqlens, maxlen, q_k_ratio)
-            self.transpose_and_pad(x, cu_seqlens, maxlen, q_k_ratio)
+                  for x in [k, v])
            for x in [k, v]
        ]
        spda_output = torch.nn.functional.scaled_dot_product_attention(
            q2, k2, v2, attn_mask=attn_mask, scale=sm_scale)
        return self.transpose_and_unpad(spda_output, cu_seqlens)
--- a/vllm/config.py
+++ b/vllm/config.py
@ -668,9 +668,10 @@ class ModelConfig:
    @property
    def is_encoder_decoder_model(self) -> bool:
        """Extract the HF encoder/decoder model flag."""
-        return getattr(self.hf_config, "is_encoder_decoder", False) or (
+        return getattr(
-            (hasattr(self.hf_config, "text_config") and getattr(
+            self.hf_config, "is_encoder_decoder",
-                self.hf_config.text_config, "is_encoder_decoder", False)))
+            False) or (hasattr(self.hf_config, "text_config") and getattr(
                self.hf_config.text_config, "is_encoder_decoder", False))
    @property
    def is_multimodal_model(self) -> bool:
--- a/vllm/core/evictor.py
+++ b/vllm/core/evictor.py
@ -52,7 +52,7 @@ class Evictor(ABC):
        pass
-class BlockMetaData():
+class BlockMetaData:
    """Data structure for storing key data describe cached block, so that
    evitor could use to make its decision which one to choose for eviction
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@ -240,7 +240,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
    if is_distributed:
        get_world_group().barrier()
    logger.info("reading GPU P2P access cache from %s", path)
-    with open(path, "r") as f:
+    with open(path) as f:
        cache = json.load(f)
    _gpu_p2p_access_cache = cache
    return _gpu_p2p_access_cache[f"{src}->{tgt}"]
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@ -812,7 +812,7 @@ class AsyncLLMEngine(EngineClient):
    async def run_engine_loop(engine_ref: ReferenceType):
        """We use a weakref to the engine so that the running loop
        doesn't prevent the engine being garbage collected."""
-        engine: Optional["AsyncLLMEngine"] = engine_ref()
+        engine: Optional[AsyncLLMEngine] = engine_ref()
        if not engine:
            return
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@ -1541,8 +1541,8 @@ class LLMEngine:
                seq_group.state.remaining_steps != ref_remaining_steps
                for seq_group in seq_group_metadata_list[1:]
        ]):
-            raise AssertionError(("All running sequence groups should "
+            raise AssertionError("All running sequence groups should "
-                                  "have the same remaining steps."))
+                                 "have the same remaining steps.")
        return ref_remaining_steps > 0
--- a/vllm/engine/metrics_types.py
+++ b/vllm/engine/metrics_types.py
@ -77,7 +77,7 @@ class StatLoggerBase(ABC):
        self.num_generation_tokens: List[int] = []
        self.last_local_log = time.time()
        self.local_interval = local_interval
-        self.spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None
+        self.spec_decode_metrics: Optional[SpecDecodeWorkerMetrics] = None
    @abstractmethod
    def log(self, stats: Stats) -> None:
--- a/vllm/engine/output_processor/multi_step.py
+++ b/vllm/engine/output_processor/multi_step.py
@ -63,7 +63,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
            single_step_process_prompt_logprob(self, seq_group, output)
    @staticmethod
-    @functools.lru_cache()
+    @functools.lru_cache
    def _log_prompt_logprob_unsupported_warning_once():
        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
        # If the feature combo become valid
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@ -362,7 +362,7 @@ def load_chat_template(
    if chat_template is None:
        return None
    try:
-        with open(chat_template, "r") as f:
+        with open(chat_template) as f:
            resolved_chat_template = f.read()
    except OSError as e:
        if isinstance(chat_template, Path):
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@ -120,7 +120,7 @@ async def read_file(path_or_url: str) -> str:
                   session.get(path_or_url) as resp:
            return await resp.text()
    else:
-        with open(path_or_url, "r", encoding="utf-8") as f:
+        with open(path_or_url, encoding="utf-8") as f:
            return f.read()
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@ -32,7 +32,7 @@ class RayGPUExecutor(DistributedGPUExecutor):
    uses_ray: bool = True
    def _init_executor(self) -> None:
-        self.forward_dag: Optional["ray.dag.CompiledDAG"] = None
+        self.forward_dag: Optional[ray.dag.CompiledDAG] = None
        # If the env var is set, it uses the Ray's compiled DAG API
        # which optimizes the control plane overhead.
        # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
--- a/vllm/logger.py
+++ b/vllm/logger.py
@ -67,8 +67,7 @@ def _configure_vllm_root_logger() -> None:
            raise RuntimeError(
                "Could not load logging config. File does not exist: %s",
                VLLM_LOGGING_CONFIG_PATH)
-        with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8",
+        with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file:
                  mode="r") as file:
            custom_config = json.loads(file.read())
        if not isinstance(custom_config, dict):
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@ -343,7 +343,7 @@ class LoRAModelManager(AdapterModelManager):
            # text modules (e.g. ChatGLM)
            and hasattr(self.model, "get_mm_mapping"))
        self.packed_modules: Dict[str, List[str]] = {}
-        self.modules: Dict[str, "BaseLayerWithLoRA"] = {}
+        self.modules: Dict[str, BaseLayerWithLoRA] = {}
        # Dict instead of a Set for compatibility with LRUCache.
        self._last_mapping: Optional[LoRAMapping] = None
        self._create_lora_modules()
@ -548,7 +548,7 @@ class LoRAModelManager(AdapterModelManager):
            else:
                parts = module_name.split(".")
                replacements = self.packed_modules_mapping[parts[-1]]
-                subloras: List[Optional["LoRALayerWeights"]] = []
+                subloras: List[Optional[LoRALayerWeights]] = []
                for i, r in enumerate(replacements):
                    lora = LoRALayerWeights.create_dummy_lora_weights(
                        module_name + "." + r,
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@ -103,7 +103,7 @@ class CustomOp(nn.Module):
    # On by default if VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.PIECEWISE
    # Specifying 'all' or 'none' in VLLM_CUSTOM_OPS takes precedence.
    @staticmethod
-    @lru_cache()
+    @lru_cache
    def default_on() -> bool:
        count_none = envs.VLLM_CUSTOM_OPS.count("none")
        count_all = envs.VLLM_CUSTOM_OPS.count("all")
--- a/vllm/model_executor/layers/resampler.py
+++ b/vllm/model_executor/layers/resampler.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@ -746,7 +746,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
        config_file_path = self._get_config_file(qlora_adapter)
-        with open(config_file_path, "r") as f:
+        with open(config_file_path) as f:
            config = json.load(f)
            self.target_modules = config["target_modules"]
--- a/vllm/model_executor/model_loader/openvino.py
+++ b/vllm/model_executor/model_loader/openvino.py
@ -190,7 +190,7 @@ def get_model(
    kv_cache_dtype: ov.Type,
    **kwargs,
 ) -> torch.nn.Module:
-    lora_config = kwargs.get("lora_config", None)
+    lora_config = kwargs.get("lora_config")
    ov_core = kwargs.get("ov_core")
    if lora_config:
        raise ValueError(
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@ -280,7 +280,7 @@ class TensorizerAgent:
        self.tensorizer_args = (
            self.tensorizer_config._construct_tensorizer_args())
        self.extra_kwargs = extra_kwargs
-        if extra_kwargs.get("quant_config", None) is not None:
+        if extra_kwargs.get("quant_config") is not None:
            self.quant_config = extra_kwargs["quant_config"]
        else:
            self.quant_config = quant_config
@ -380,8 +380,7 @@ def tensorizer_weights_iterator(
    stream = open_stream(tensorizer_args.tensorizer_uri, **stream_params)
    with TensorDeserializer(stream, **deserializer_args,
                            device="cpu") as state:
-        for name, param in state.items():
+        yield from state.items()
            yield name, param
    del state
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@ -188,7 +188,7 @@ def get_quant_config(model_config: ModelConfig,
            f"{quant_config_files}")
    quant_config_file = quant_config_files[0]
-    with open(quant_config_file, "r") as f:
+    with open(quant_config_file) as f:
        config = json.load(f)
        if model_config.quantization == "bitsandbytes":
@ -306,7 +306,7 @@ def filter_duplicate_safetensors_files(hf_weights_files: List[str],
    # Iterate through the weight_map (weight_name: safetensors files)
    # to identify weights that we should use.
-    with open(index_file_name, "r") as f:
+    with open(index_file_name) as f:
        weight_map = json.load(f)["weight_map"]
    weight_files_in_index = set()
    for weight_name in weight_map:
@ -382,7 +382,7 @@ def np_cache_weights_iterator(
            with open(weight_names_file, "w") as f:
                json.dump(weight_names, f)
-    with open(weight_names_file, "r") as f:
+    with open(weight_names_file) as f:
        weight_names = json.load(f)
    for name in weight_names:
@ -423,8 +423,7 @@ def pt_weights_iterator(
            bar_format=_BAR_FORMAT,
    ):
        state = torch.load(bin_file, map_location="cpu")
-        for name, param in state.items():
+        yield from state.items()
            yield name, param
        del state
        torch.cuda.empty_cache()
--- a/vllm/model_executor/models/arctic.py
+++ b/vllm/model_executor/models/arctic.py
@ -48,7 +48,7 @@ class ArcticMLP(nn.Module):
                 is_residual_mlp: bool = False,
                 quant_config: Optional[QuantizationConfig] = None,
                 reduce_results: bool = True):
-        super(ArcticMLP, self).__init__()
+        super().__init__()
        self.hidden_size = config.hidden_size
        self.expert_id = expert_id
        self.layer_id = layer_id
@ -89,7 +89,7 @@ class ArcticMoE(nn.Module):
                 params_dtype: Optional[torch.dtype] = None,
                 quant_config: Optional[QuantizationConfig] = None,
                 reduce_results: bool = True):
-        super(ArcticMoE, self).__init__()
+        super().__init__()
        self.tp_size = tp_size or get_tensor_model_parallel_world_size()
        self.hidden_size = config.hidden_size
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/bloom/modeling_bloom.py
 # Copyright 2023 The vLLM team.
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/THUDM/GLM-4
 """Inference-only ChatGLM model compatible with THUDM weights."""
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Copyright 2024 Cohere and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@ -1,4 +1,3 @@
 # coding=utf-8
 from typing import Iterable, List, Optional, Tuple, Union
 import torch
--- a/vllm/model_executor/models/decilm.py
+++ b/vllm/model_executor/models/decilm.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 DeciAI Research Team. All rights reserved.
--- a/vllm/model_executor/models/deepseek.py
+++ b/vllm/model_executor/models/deepseek.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/modeling_exaone.py
 # Copyright 2024 The LG U+ CTO AI Tech Lab.
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/a5cc30d72ae2dc19af534e4b35c986cc28db1275/src/transformers/models/falcon/modeling_falcon.py
 # Copyright 2023 The vLLM team.
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/fuyu/modeling_fuyu.py
 # Copyright 2023 The vLLM team.
 # Copyright 2023 HuggingFace Inc. team. All rights reserved.
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Copyright 2023 The vLLM team.
 # Copyright (c) Google Inc.
 #
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Copyright 2024 The vLLM team.
 # Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
 #
--- a/vllm/model_executor/models/glm4_vision_encoder.py
+++ b/vllm/model_executor/models/glm4_vision_encoder.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/THUDM/GLM-4
 """Inference-only GLM-4v model visual encoder compatible with THUDM weights."""
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
 # Copyright 2023 The vLLM team.
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
 # Copyright 2023 The vLLM team.
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gptj/modeling_gptj.py
 # Copyright 2023 The vLLM team.
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py
 # Copyright 2023 The vLLM team.
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
--- a/vllm/model_executor/models/granitemoe.py
+++ b/vllm/model_executor/models/granitemoe.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
--- a/vllm/model_executor/models/idefics2_vision_model.py
+++ b/vllm/model_executor/models/idefics2_vision_model.py
@ -1,5 +1,3 @@
 # coding=utf-8
 # adapted from https://github.com/huggingface/transformers/blob/v4.43.2/src/transformers/models/idefics2/modeling_idefics2.py
 # Copyright 2024 The vLLM team.
 # Copyright 2024 the HuggingFace Inc. team. All rights reserved.
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@ -1,4 +1,3 @@
 # -*- coding: utf-8 -*-
 from functools import partial
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
--- a/vllm/model_executor/models/internlm2_ve.py
+++ b/vllm/model_executor/models/internlm2_ve.py
@ -1,4 +1,3 @@
 # -*- coding: utf-8 -*-
 from typing import List, Optional, Tuple, Union
 import torch
--- a/vllm/model_executor/models/jais.py
+++ b/vllm/model_executor/models/jais.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://huggingface.co/inceptionai/jais-30b-chat-v3/blob/main/modeling_jais.py
 # Copyright 2023 The vLLM team.
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@ -1,4 +1,3 @@
 # coding=utf-8
 """Inference-only Jamba model."""
 from typing import Iterable, List, Optional, Tuple
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
--- a/vllm/model_executor/models/mamba.py
+++ b/vllm/model_executor/models/mamba.py
@ -1,4 +1,3 @@
 # coding=utf-8
 """PyTorch MAMBA model."""
 from typing import Iterable, List, Optional, Tuple
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
--- a/vllm/model_executor/models/minicpm3.py
+++ b/vllm/model_executor/models/minicpm3.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2024 The ModelBest team.
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
--- a/vllm/model_executor/models/mixtral_quant.py
+++ b/vllm/model_executor/models/mixtral_quant.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Copyright 2024 the HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
--- a/vllm/model_executor/models/mlp_speculator.py
+++ b/vllm/model_executor/models/mlp_speculator.py
@ -37,7 +37,7 @@ class MLPSpeculatorLayerNorm(nn.Module):
        eps=1e-06,
        elementwise_scale_and_shift=True,
    ):
-        super(MLPSpeculatorLayerNorm, self).__init__()
+        super().__init__()
        self.elementwise_scale_and_shift = elementwise_scale_and_shift
        if self.elementwise_scale_and_shift:
            self.weight = nn.Parameter(torch.empty(normalized_shape))
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@ -1121,9 +1121,9 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
            batch_size * num_image * num_patch, -1).contiguous()
        image_input_idx = image_input_idx * valid.to(image_input_idx.dtype)
-        offset = torch.cat(
+        offset = torch.cat([seq_len.new_zeros(1),
-            [seq_len.new_zeros(
+                            seq_len.cumsum(dim=0)[:-1]],
-                (1)), seq_len.cumsum(dim=0)[:-1]], dim=0)[:, None]
+                           dim=0)[:, None]
        image_input_idx = image_input_idx + offset.to(image_input_idx.dtype)
        image_input_idx = image_input_idx.flatten()[:, None]
        mat = image_input_idx == torch.arange(
--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main
 import math
 from typing import Iterable, List, Optional, Tuple, Union
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/olmo/modeling_olmo.py
 # Copyright 2024 The vLLM team.
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py
 # Copyright 2023 The vLLM team.
--- a/vllm/model_executor/models/orion.py
+++ b/vllm/model_executor/models/orion.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/modeling_orion.py
 # Copyright (c) OrionStar Inc.
--- a/vllm/model_executor/models/persimmon.py
+++ b/vllm/model_executor/models/persimmon.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/persimmon/modeling_persimmon.py
 # Copyright 2023 The vLLM team.
 # Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_phi.py
 # Copyright 2023 The vLLM team.
--- a/vllm/model_executor/models/phi3.py
+++ b/vllm/model_executor/models/phi3.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from llama.py
 """Inference-only Phi3 model code inherit from Llama.py"""
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Copyright 2024 The vLLM team.
 # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
 #
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@ -136,11 +136,11 @@ def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs):
        if image_token_id not in inputs['prompt_token_ids']:
            raise ValueError(
-                (f"You've passed {inputs=} without {image_token_id=}"
+                f"You've passed {inputs=} without {image_token_id=}"
-                 " Make sure to process your input via mistral_common's"
+                " Make sure to process your input via mistral_common's"
-                 " tokenizer or pass a chat completion request. For more"
+                " tokenizer or pass a chat completion request. For more"
-                 " For more info, see: "
+                " For more info, see: "
-                 "https://github.com/vllm-project/vllm/issues/8411."))
+                "https://github.com/vllm-project/vllm/issues/8411.")
    return inputs
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py
 # Copyright (c) Alibaba Cloud.
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py
 # Copyright 2024 The Qwen team.
@ -417,9 +416,9 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                and hasattr(config, "max_window_layers")):
            raise ValueError("Sliding window for some but all layers is not "
                             "supported. This model uses sliding window "
-                             "but `max_window_layers` = %s is less than "
+                             "but `max_window_layers` = {} is less than "
-                             "`num_hidden_layers` = %s. Please open an issue "
+                             "`num_hidden_layers` = {}. Please open an issue "
-                             "to discuss this feature." % (
+                             "to discuss this feature.".format(
                                 config.max_window_layers,
                                 config.num_hidden_layers,
                             ))
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Copyright 2024 The Qwen team.
 # Copyright 2023 The vLLM team.
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
--- a/vllm/model_executor/models/qwen2_cls.py
+++ b/vllm/model_executor/models/qwen2_cls.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py
 # Copyright 2024 Kakao Corp. (Kanana-X Team)
@ -60,9 +59,9 @@ class Qwen2ForSequenceClassification(nn.Module):
                and hasattr(config, "max_window_layers")):
            raise ValueError("Sliding window for some but all layers is not "
                             "supported. This model uses sliding window "
-                             "but `max_window_layers` = %s is less than "
+                             "but `max_window_layers` = {} is less than "
-                             "`num_hidden_layers` = %s. Please open an issue "
+                             "`num_hidden_layers` = {}. Please open an issue "
-                             "to discuss this feature." % (
+                             "to discuss this feature.".format(
                                 config.max_window_layers,
                                 config.num_hidden_layers,
                             ))
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
 # Copyright 2024 The Qwen team.
--- a/vllm/model_executor/models/qwen2_rm.py
+++ b/vllm/model_executor/models/qwen2_rm.py
@ -1,4 +1,3 @@
 # coding=utf-8
 # Adapted from
 # https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py
 # Copyright 2024 The Qwen team.
@ -71,9 +70,9 @@ class Qwen2ForRewardModel(nn.Module, SupportsPP):
                and hasattr(config, "max_window_layers")):
            raise ValueError("Sliding window for some but all layers is not "
                             "supported. This model uses sliding window "
-                             "but `max_window_layers` = %s is less than "
+                             "but `max_window_layers` = {} is less than "
-                             "`num_hidden_layers` = %s. Please open an issue "
+                             "`num_hidden_layers` = {}. Please open an issue "
-                             "to discuss this feature." % (
+                             "to discuss this feature.".format(
                                 config.max_window_layers,
                                 config.num_hidden_layers,
                             ))
--- a/Show More
+++ b/Show More