Revert #29787 and #29690 (#29815)

2026-03-20 01:47:52 +08:00 · 2025-12-01 13:42:03 -08:00 · 2025-12-01 13:42:03 -08:00 · 1336a1ea24
commit 1336a1ea24
parent eaf81485ed
9 changed files with 185 additions and 580 deletions
--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
@ -0,0 +1,46 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import argparse
+import os
+
+template = """<!DOCTYPE html>
+<html>
+    <body>
+    <h1>Links for vLLM</h1>
+        <a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
+        <a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
+    </body>
+</html>
+"""
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--wheel", help="The wheel path.", required=True)
+args = parser.parse_args()
+
+filename = os.path.basename(args.wheel)
+
+# Resolve names before opening index.html: a bad name must not truncate the file.
+# sync the abi tag with .buildkite/scripts/upload-wheels.sh
+if "x86_64" in filename:
+    x86_wheel = filename
+    arm_wheel = filename.replace("x86_64", "aarch64").replace(
+        "manylinux1", "manylinux2014"
+    )
+elif "aarch64" in filename:
+    x86_wheel = filename.replace("aarch64", "x86_64").replace(
+        "manylinux2014", "manylinux1"
+    )
+    arm_wheel = filename
+else:
+    raise ValueError(f"Unsupported wheel: {filename}")
+# cloudfront requires escaping the '+' character
+index_html = template.format(
+    x86_wheel=x86_wheel,
+    x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
+    arm_wheel=arm_wheel,
+    arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
+)
+with open("index.html", "w") as f:
+    f.write(index_html)
+print(f"Generated index.html for {args.wheel}")
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -8,7 +8,7 @@ steps:
    commands:
      # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
@ -30,6 +30,19 @@ steps:
      DOCKER_BUILDKIT: "1"

  # x86 + CUDA builds
+  - label: "Build wheel - CUDA 12.8"
+    depends_on: ~
+    id: build-wheel-cuda-12-8
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
+
  - label: "Build wheel - CUDA 12.9"
    depends_on: ~
    id: build-wheel-cuda-12-9
@ -96,6 +109,7 @@ steps:
  - label: "Annotate release workflow"
    depends_on:
      - create-multi-arch-manifest
+      - build-wheel-cuda-12-8
    id: annotate-release-workflow
    agents:
      queue: cpu_queue_postmerge
--- a/.buildkite/scripts/generate-nightly-index.py
+++ b/.buildkite/scripts/generate-nightly-index.py
@ -1,369 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# do not complain about line length (for docstring)
-# ruff: noqa: E501
-
-import argparse
-import json
-import re
-import sys
-from dataclasses import asdict, dataclass
-from pathlib import Path
-from typing import Any
-from urllib.parse import quote
-
-if not sys.version_info >= (3, 10):
-    raise RuntimeError("This script requires Python 3.10 or higher.")
-
-INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
-<html>
-  <meta name="pypi:repository-version" content="1.0">
-  <body>
-{items}
-  </body>
-</html>
-"""
-
-
-@dataclass
-class WheelFileInfo:
-    package_name: str
-    version: str
-    build_tag: str | None
-    python_tag: str
-    abi_tag: str
-    platform_tag: str
-    variant: str | None
-    filename: str
-
-
-def parse_from_filename(file: str) -> WheelFileInfo:
-    """
-    Parse wheel file name to extract metadata.
-
-    The format of wheel names:
-        {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
-    All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not).
-    Example:
-        vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
-        vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
-        vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
-        vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl
-    """
-    wheel_file_re = re.compile(
-        r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
-    )
-    match = wheel_file_re.match(file)
-    if not match:
-        raise ValueError(f"Invalid wheel file name: {file}")
-
-    package_name = match.group("package_name")
-    version = match.group("version")
-    build_tag = match.group("build_tag")
-    python_tag = match.group("python_tag")
-    abi_tag = match.group("abi_tag")
-    platform_tag = match.group("platform_tag")
-
-    # extract variant from version
-    variant = None
-    if "dev" in version:
-        ver_after_dev = version.split("dev")[-1]
-        if "." in ver_after_dev:
-            variant = ver_after_dev.split(".")[-1]
-            version = version.removesuffix("." + variant)
-    else:
-        if "+" in version:
-            version, variant = version.split("+")
-
-    return WheelFileInfo(
-        package_name=package_name,
-        version=version,
-        build_tag=build_tag,
-        python_tag=python_tag,
-        abi_tag=abi_tag,
-        platform_tag=platform_tag,
-        variant=variant,
-        filename=file,
-    )
-
-
-def generate_project_list(subdir_names: list[str]) -> str:
-    """
-    Generate project list HTML content linking to each project & variant sub-directory.
-    """
-    href_tags = []
-    for name in sorted(subdir_names):
-        name = name.strip("/").strip(".")
-        href_tags.append(f'    <a href="{name}/">{name}/</a><br/>')
-    return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
-
-
-def generate_package_index_and_metadata(
-    wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path
-) -> tuple[str, str]:
-    """
-    Generate package index HTML content for a specific package, linking to actual wheel files.
-    """
-    href_tags = []
-    metadata = []
-    for file in sorted(wheel_files, key=lambda x: x.filename):
-        relative_path = (
-            wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename
-        )
-        # handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B'
-        # NOTE: this is AWS S3 specific behavior!
-        file_path_quoted = quote(relative_path.as_posix(), safe=":%/")
-        href_tags.append(f'    <a href="{file_path_quoted}">{file.filename}</a><br/>')
-        file_meta = asdict(file)
-        file_meta["path"] = file_path_quoted
-        metadata.append(file_meta)
-    index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
-    metadata_str = json.dumps(metadata, indent=2)
-    return index_str, metadata_str
-
-
-def generate_index_and_metadata(
-    whl_files: list[str],
-    wheel_base_dir: Path,
-    index_base_dir: Path,
-    default_variant: str | None = None,
-    alias_to_default: str | None = None,
-):
-    """
-    Generate index for all wheel files.
-
-    Args:
-        whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
-        wheel_base_dir (Path): Base directory for wheel files.
-        index_base_dir (Path): Base directory to store index files.
-        default_variant (str | None): The default variant name, if any.
-        alias_to_default (str | None): Alias variant name for the default variant, if any.
-
-    First, parse all wheel files to extract metadata.
-    We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
-    The index for the default variant (if any) is generated in the root index directory.
-
-    If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
-    is purely a copy of the corresponding variant index, with only the links adjusted.
-    Otherwise, all wheels without variant suffixes are treated as the default variant.
-
-    If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
-    as the default variant index, but the links are adjusted accordingly.
-
-    Index directory structure:
-        index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
-            index.html  # project list, linking to "vllm/" and other packages, and all variant sub-directories
-            vllm/
-                index.html # package index, pointing to actual files in wheel_base_dir (relative path)
-                metadata.json # machine-readable metadata for all wheels in this package
-            cpu/ # cpu variant sub-directory
-                index.html
-                vllm/
-                    index.html
-                    metadata.json
-            cu129/ # cu129 is actually the alias to default variant
-                index.html
-                vllm/
-                    index.html
-                    metadata.json
-            cu130/ # cu130 variant sub-directory
-                index.html
-                vllm/
-                    index.html
-                    metadata.json
-            ...
-
-    metadata.json stores a dump of all wheel files' metadata in a machine-readable format:
-        [
-            {
-                "package_name": "vllm",
-                "version": "0.10.2rc2",
-                "build_tag": null,
-                "python_tag": "cp38",
-                "abi_tag": "abi3",
-                "platform_tag": "manylinux2014_aarch64",
-                "variant": "cu129",
-                "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
-                "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL and URL-encoded
-            },
-            ...
-        ]
-    """
-
-    parsed_files = [parse_from_filename(f) for f in whl_files]
-
-    if not parsed_files:
-        print("No wheel files found, skipping index generation.")
-        return
-
-    # Group by variant
-    variant_to_files: dict[str, list[WheelFileInfo]] = {}
-    for file in parsed_files:
-        variant = file.variant or "default"
-        if variant not in variant_to_files:
-            variant_to_files[variant] = []
-        variant_to_files[variant].append(file)
-
-    print(f"Found variants: {list(variant_to_files.keys())}")
-
-    # sanity check for default variant
-    if default_variant:
-        if "default" in variant_to_files:
-            raise ValueError(
-                "All wheel files must have variant suffixes when `default_variant` is specified."
-            )
-        if default_variant not in variant_to_files:
-            raise ValueError(
-                f"Default variant '{default_variant}' not found among wheel files."
-            )
-
-    if alias_to_default:
-        if "default" not in variant_to_files:
-            # e.g. only some wheels are uploaded to S3 currently
-            print(
-                "[WARN] Alias to default variant specified, but no default variant found."
-            )
-        elif alias_to_default in variant_to_files:
-            raise ValueError(
-                f"Alias variant name '{alias_to_default}' already exists among wheel files."
-            )
-        else:
-            variant_to_files[alias_to_default] = variant_to_files["default"].copy()
-            print(f"Alias variant '{alias_to_default}' created for default variant.")
-
-    # Generate index for each variant
-    subdir_names = set()
-    for variant, files in variant_to_files.items():
-        if variant == "default":
-            variant_dir = index_base_dir
-        else:
-            variant_dir = index_base_dir / variant
-            subdir_names.add(variant)
-
-        variant_dir.mkdir(parents=True, exist_ok=True)
-
-        # gather all package names in this variant
-        packages = set(f.package_name for f in files)
-        if variant == "default":
-            # these packages should also appear in the "project list"
-            # generate after all variants are processed
-            subdir_names = subdir_names.union(packages)
-        else:
-            # generate project list for this variant directly
-            project_list_str = generate_project_list(sorted(packages))
-            with open(variant_dir / "index.html", "w") as f:
-                f.write(project_list_str)
-
-        for package in packages:
-            # filter files belonging to this package only
-            package_files = [f for f in files if f.package_name == package]
-            package_dir = variant_dir / package
-            package_dir.mkdir(parents=True, exist_ok=True)
-            index_str, metadata_str = generate_package_index_and_metadata(
-                package_files, wheel_base_dir, package_dir
-            )
-            with open(package_dir / "index.html", "w") as f:
-                f.write(index_str)
-            with open(package_dir / "metadata.json", "w") as f:
-                f.write(metadata_str)
-
-    # Generate top-level project list index
-    project_list_str = generate_project_list(sorted(subdir_names))
-    with open(index_base_dir / "index.html", "w") as f:
-        f.write(project_list_str)
-
-
-if __name__ == "__main__":
-    """
-    Arguments:
-        --version <version> : version string for the current build (e.g., commit hash)
-        --current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
-        --output-dir <output_directory> : directory to store generated index files
-        --alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
-    """
-
-    parser = argparse.ArgumentParser(
-        description="Process nightly build wheel files to generate indices."
-    )
-    parser.add_argument(
-        "--version",
-        type=str,
-        required=True,
-        help="Version string for the current build (e.g., commit hash)",
-    )
-    parser.add_argument(
-        "--current-objects",
-        type=str,
-        required=True,
-        help="Path to JSON file containing current S3 objects listing in this version directory",
-    )
-    parser.add_argument(
-        "--output-dir",
-        type=str,
-        required=True,
-        help="Directory to store generated index files",
-    )
-    parser.add_argument(
-        "--alias-to-default",
-        type=str,
-        default=None,
-        help="Alias variant name for the default variant",
-    )
-
-    args = parser.parse_args()
-
-    version = args.version
-    if "/" in version or "\\" in version:
-        raise ValueError("Version string must not contain slashes.")
-    current_objects_path = Path(args.current_objects)
-    output_dir = Path(args.output_dir)
-    if not output_dir.exists():
-        output_dir.mkdir(parents=True, exist_ok=True)
-
-    # Read current objects JSON
-    with open(current_objects_path) as f:
-        current_objects: dict[str, list[dict[str, Any]]] = json.load(f)
-
-    # current_objects looks like from list_objects_v2 S3 API:
-    """
-    "Contents": [
-        {
-            "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
-            "LastModified": "2025-11-28T14:00:32+00:00",
-            "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
-            "ChecksumAlgorithm": [
-                "CRC64NVME"
-            ],
-            "ChecksumType": "FULL_OBJECT",
-            "Size": 435649349,
-            "StorageClass": "STANDARD"
-        },
-        ...
-    ]
-    """
-
-    # Extract wheel file keys
-    wheel_files = []
-    for item in current_objects.get("Contents", []):
-        key: str = item["Key"]
-        if key.endswith(".whl"):
-            wheel_files.append(key.split("/")[-1])  # only the filename is used
-
-    print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
-
-    # Generate index and metadata, assuming wheels and indices are stored as:
-    # s3://vllm-wheels/{version}/<wheel files>
-    # s3://vllm-wheels/<anything>/<index files>
-    wheel_base_dir = Path(output_dir).parent / version
-    index_base_dir = Path(output_dir)
-
-    generate_index_and_metadata(
-        whl_files=wheel_files,
-        wheel_base_dir=wheel_base_dir,
-        index_base_dir=index_base_dir,
-        default_variant=None,
-        alias_to_default=args.alias_to_default,
-    )
-    print(f"Successfully generated index and metadata in {output_dir}")
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@ -2,28 +2,6 @@

 set -ex

-# ======== part 0: setup ========
-
-BUCKET="vllm-wheels"
-INDICES_OUTPUT_DIR="indices"
-DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
-PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3
-SUBPATH=$BUILDKITE_COMMIT
-S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
-
-# detect if python3.10+ is available
-has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,10) else 0)")
-if [[ "$has_new_python" -eq 0 ]]; then
-    # use new python from docker
-    docker pull python:3-slim
-    PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
-fi
-
-echo "Using python interpreter: $PYTHON"
-echo "Python version: $($PYTHON --version)"
-
-# ========= part 1: collect, rename & upload the wheel ==========
-
 # Assume wheels are in artifacts/dist/*.whl
 wheel_files=(artifacts/dist/*.whl)

@ -32,69 +10,74 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
  echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
  exit 1
 fi
+
+# Get the single wheel file
 wheel="${wheel_files[0]}"

-# current build image uses ubuntu 20.04, which corresponds to manylinux_2_31
-# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
-manylinux_version="manylinux_2_31"
+# Detect architecture and rename 'linux' to appropriate manylinux version
+arch=$(uname -m)
+if [[ $arch == "x86_64" ]]; then
+    manylinux_version="manylinux1"
+elif [[ $arch == "aarch64" ]]; then
+    manylinux_version="manylinux2014"
+else
+    echo "Warning: Unknown architecture $arch, using manylinux1 as default"
+    manylinux_version="manylinux1"
+fi

 # Rename 'linux' to the appropriate manylinux version in the wheel filename
-if [[ "$wheel" != *"linux"* ]]; then
-  echo "Error: Wheel filename does not contain 'linux': $wheel"
-  exit 1
-fi
 new_wheel="${wheel/linux/$manylinux_version}"
 mv -- "$wheel" "$new_wheel"
 wheel="$new_wheel"
-echo "Renamed wheel to: $wheel"

 # Extract the version from the wheel
 version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
-echo "Version in wheel: $version"
-pure_version="${version%%+*}"
-echo "Pure version (without variant): $pure_version"
+echo "Version: $version"

-# copy wheel to its own bucket
-aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
+normal_wheel="$wheel" # Save the original wheel filename

-# ========= part 2: generate and upload indices ==========
-# generate indices for all existing wheels in the commit directory
-# this script might be run multiple times if there are multiple variants being built
-# so we need to guarantee there is little chance for "TOCTOU" issues
-# i.e., one process is generating indices while another is uploading a new wheel
-# so we need to ensure no time-consuming operations happen below
+# If the version contains "dev", also keep a copy named with the placeholder version 1.0.0.dev (plus any +cuXXX suffix) so the URL stays stable
+if [[ $version == *dev* ]]; then
+    suffix="${version##*.}"
+    if [[ $suffix == cu* ]]; then
+        new_version="1.0.0.dev+${suffix}"
+    else
+        new_version="1.0.0.dev"
+    fi
+    new_wheel="${wheel/$version/$new_version}"
+    # use cp to keep both files in the artifacts directory
+    cp -- "$wheel" "$new_wheel"
+    wheel="$new_wheel"
+    version="$new_version"
+fi

-# list all wheels in the commit directory
-echo "Existing wheels on S3:"
-aws s3 ls "$S3_COMMIT_PREFIX"
-obj_json="objects.json"
-aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
-mkdir -p "$INDICES_OUTPUT_DIR"
+# Generate index.html linking to both the x86_64 and aarch64 wheels
+python3 .buildkite/generate_index.py --wheel "$normal_wheel"

-# call script to generate indicies for all existing wheels
-# this indices have relative paths that could work as long as it is next to the wheel directory in s3
-# i.e., the wheels are always in s3://vllm-wheels/<commit>/
-# and indices can be placed in /<commit>/, or /nightly/, or /<version>/
-if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
-    alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
+# upload the wheel(s) to the per-commit directory
+aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+
+if [[ $normal_wheel == *"cu129"* ]]; then
+    # only upload index.html for cu129 wheels (default wheels) as it
+    # is available on both x86 and arm64
+    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
+    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 else
-    alias_arg=""
+    echo "Skipping index files for non-cu129 wheels"
 fi

-$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
+# upload the wheel(s) to the nightly directory
+aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"

-# copy indices to /<commit>/ unconditionally
-echo "Uploading indices to $S3_COMMIT_PREFIX"
-aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
-
-# copy to /nightly/ only if it is on the main branch and not a PR 
-if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
-    echo "Uploading indices to overwrite /nightly/"
-    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
+if [[ $normal_wheel == *"cu129"* ]]; then
+    # only upload index.html for cu129 wheels (default wheels) as it
+    # is available on both x86 and arm64
+    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
+else
+    echo "Skipping index files for non-cu129 wheels"
 fi

-# copy to /<pure_version>/ only if it does not have "dev" in the version
-if [[ "$version" != *"dev"* ]]; then
-    echo "Uploading indices to overwrite /$pure_version/"
-    aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
-fi
+aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
+aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@ -46,23 +46,10 @@ vLLM is a Python library that supports the following CPU variants. Select your C

 ### Pre-built wheels

-Please refer to the instructions for [pre-built wheels on GPU](./gpu.md#pre-built-wheels).
-
-When specifying the index URL, please make sure to use the `cpu` variant subdirectory.
-For example, the nightly build index is: `https://wheels.vllm.ai/nightly/cpu/`.
+Currently, there are no pre-built CPU wheels.

 ### Build wheel from source

-#### Set up using Python-only build (without compilation) {#python-only-build}
-
-Please refer to the instructions for [Python-only build on GPU](./gpu.md#python-only-build), and replace the build commands with:
-
-```bash
-VLLM_USE_PRECOMPILED=1 VLLM_PRECOMPILED_WHEEL_VARIANT=cpu VLLM_TARGET_DEVICE=cpu uv pip install --editable .
-```
-
-#### Full build (with compilation) {#full-build}
-
 === "Intel/AMD x86"

    --8<-- "docs/getting_started/installation/cpu.x86.inc.md:build-wheel-from-source"
--- a/docs/getting_started/installation/gpu.cuda.inc.md
+++ b/docs/getting_started/installation/gpu.cuda.inc.md
@ -26,50 +26,43 @@ uv pip install vllm --torch-backend=auto

 ??? console "pip"
    ```bash
-    # Install vLLM with CUDA 12.9.
-    pip install vllm --extra-index-url https://download.pytorch.org/whl/cu129
+    # Install vLLM with CUDA 12.8.
+    pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128
    ```

-We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu128`), set `--torch-backend=cu128` (or `UV_TORCH_BACKEND=cu128`). If this doesn't work, try running `uv self update` to update `uv` first.
+We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`). If this doesn't work, try running `uv self update` to update `uv` first.

 !!! note
    NVIDIA Blackwell GPUs (B200, GB200) require a minimum of CUDA 12.8, so make sure you are installing PyTorch wheels with at least that version. PyTorch itself offers a [dedicated interface](https://pytorch.org/get-started/locally/) to determine the appropriate pip command to run for a given target configuration.

-As of now, vLLM's binaries are compiled with CUDA 12.9 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.8, 13.0, and public PyTorch release versions:
+As of now, vLLM's binaries are compiled with CUDA 12.8 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.6, 11.8, and public PyTorch release versions:

 ```bash
-# Install vLLM with a specific CUDA version (e.g., 13.0).
+# Install vLLM with a specific CUDA version (e.g., 11.8 or 12.6).
 export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
-export CUDA_VERSION=130 # or other
-uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}
+export CUDA_VERSION=118 # or 126
+uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}
 ```

 #### Install the latest code

-LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for every commit since `v0.5.3` on <https://wheels.vllm.ai/nightly>. There are multiple indices that could be used:
-
-* `https://wheels.vllm.ai/nightly`: the default variant (CUDA with version specified in `VLLM_MAIN_CUDA_VERSION`) built with the last commit on the `main` branch. Currently it is CUDA 12.9.
-* `https://wheels.vllm.ai/nightly/<variant>`: all other variants. Now this includes `cu130`, and `cpu`. The default variant (`cu129`) also has a subdirectory to keep consistency.
-
-To install from nightly index, run:
+LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`.

 ```bash
 uv pip install -U vllm \
    --torch-backend=auto \
-    --extra-index-url https://wheels.vllm.ai/nightly # add variant subdirectory here if needed
+    --extra-index-url https://wheels.vllm.ai/nightly
 ```

-!!! warning "`pip` caveat"
-
-    Using `pip` to install from nightly indices is _not supported_, because `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. In contrast, `uv` gives the extra index [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes).
-
-    If you insist on using `pip`, you have to specify the full URL of the wheel file (which can be obtained from the web page).
-
+??? console "pip"
    ```bash
-    pip install -U https://wheels.vllm.ai/nightly/vllm-0.11.2.dev399%2Bg3c7461c18-cp38-abi3-manylinux_2_31_x86_64.whl # current nightly build (the filename will change!)
-    pip install -U https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-0.11.2.dev399%2Bg3c7461c18-cp38-abi3-manylinux_2_31_x86_64.whl # from specific commit
+    pip install -U vllm \
+        --pre \
+        --extra-index-url https://wheels.vllm.ai/nightly
    ```

+    `--pre` is required for `pip` to consider pre-release versions.
+
 ##### Install specific revisions

 If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL:
@ -78,13 +71,33 @@ If you want to access the wheels for previous commits (e.g. to bisect the behavi
 export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
 uv pip install vllm \
    --torch-backend=auto \
-    --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} # add variant subdirectory here if needed
+    --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}
 ```

+The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
+
+??? note "pip"
+    If you want to access the wheels for previous commits (e.g. to bisect a behavior change or
+    performance regression), `pip`'s limitations require you to specify the full URL of the
+    wheel file, with the commit hash embedded in the URL:
+
+    ```bash
+    export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
+    pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+    ```
+
+    Note that the wheels are built with the Python 3.8 ABI (see [PEP
+    425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible
+    with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a
+    placeholder to have a unified URL for the wheels; the actual versions of the wheels are contained in
+    the wheel metadata (the wheels listed in the extra index URL have correct versions). Although we
+    don't support Python 3.8 anymore (because PyTorch 2.5 dropped support for Python 3.8), the
+    wheels are still built with the Python 3.8 ABI to keep the same wheel name as before.
+
 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]

-#### Set up using Python-only build (without compilation) {#python-only-build}
+#### Set up using Python-only build (without compilation)

 If you only need to change Python code, you can build and install vLLM without compilation. Using `uv pip`'s [`--editable` flag](https://docs.astral.sh/uv/pip/packages/#editable-packages), changes you make to the code will be reflected when you run vLLM:

@ -108,24 +121,18 @@ This command will do the following:
 In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the main branch was just merged and the wheel is being built. In this case, you can wait for around an hour to try again, or manually assign the previous commit in the installation using the `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable.

 ```bash
-export VLLM_PRECOMPILED_WHEEL_COMIMT=$(git rev-parse HEAD~1) # or earlier commit on main
-export VLLM_USE_PRECOMPILED=1
+export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
+export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
 uv pip install --editable .
 ```

-There are more environment variables to control the behavior of Python-only build:
-
-* `VLLM_PRECOMPILED_WHEEL_LOCATION`: specify the exact wheel URL or local file path of a pre-compiled wheel to use. All other logic to find the wheel will be skipped.
-* `VLLM_PRECOMPILED_WHEEL_COMMIT`: override the commit hash to download the pre-compiled wheel. It can be `nightly` to use the last **already built** commit on the main branch.
-* `VLLM_PRECOMPILED_WHEEL_VARIANT`: specify the variant subdirectory to use on the nightly index, e.g., `cu129`, `cpu`. If not specified, the CUDA variant with `VLLM_MAIN_CUDA_VERSION` will be tried, then fallback to the default variant on the remote index.
-
 You can find more information about vLLM's wheels in [Install the latest code](#install-the-latest-code).

 !!! note
    There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
    It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [Install the latest code](#install-the-latest-code) for instructions on how to install a specified wheel.

-#### Full build (with compilation) {#full-build}
+#### Full build (with compilation)

 If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes:

--- a/docs/getting_started/installation/gpu.md
+++ b/docs/getting_started/installation/gpu.md
@ -52,7 +52,7 @@ vLLM is a Python library that supports the following GPU variants. Select your G

    --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:set-up-using-python"

-### Pre-built wheels {#pre-built-wheels}
+### Pre-built wheels

 === "NVIDIA CUDA"

--- a/setup.py
+++ b/setup.py
@ -310,6 +310,9 @@ class cmake_build_ext(build_ext):
 class precompiled_build_ext(build_ext):
    """Disables extension building when using precompiled binaries."""

+    def run(self) -> None:
+        assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
+
    def build_extensions(self) -> None:
        print("Skipping build_ext: using precompiled extensions.")
        return
@ -319,17 +322,14 @@ class precompiled_wheel_utils:
    """Extracts libraries and other files from an existing wheel."""

    @staticmethod
-    def extract_precompiled_and_patch_package(
-        wheel_url_or_path: str, download_filename: str | None
-    ) -> dict:
+    def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict:
        import tempfile
        import zipfile

        temp_dir = None
        try:
            if not os.path.isfile(wheel_url_or_path):
-                # use provided filename first, then derive from URL
-                wheel_filename = download_filename or wheel_url_or_path.split("/")[-1]
+                wheel_filename = wheel_url_or_path.split("/")[-1]
                temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
                wheel_path = os.path.join(temp_dir, wheel_filename)
                print(f"Downloading wheel from {wheel_url_or_path} to {wheel_path}")
@ -648,102 +648,38 @@ package_data = {
    ]
 }

-
-def _fetch_metadata_for_variant(
-    commit: str, variant: str | None
-) -> tuple[list[dict], str]:
-    variant_dir = f"{variant}/" if variant is not None else ""
-    repo_url = f"https://wheels.vllm.ai/{commit}/{variant_dir}vllm/"
-    meta_url = repo_url + "metadata.json"
-    logger.info("Trying to fetch metadata from {}", meta_url)
-    from urllib.request import urlopen
-
-    with urlopen(meta_url) as resp:
-        # urlopen raises HTTPError on unexpected status code
-        wheels = json.loads(resp.read().decode("utf-8"))
-    return wheels, repo_url
-
-
 # If using precompiled, extract and patch package_data (in advance of setup)
 if envs.VLLM_USE_PRECOMPILED:
-    # Attempts:
-    # 1. user-specified wheel location (can be either local or remote, via
-    #    VLLM_PRECOMPILED_WHEEL_LOCATION)
-    # 2. user-specified variant from nightly repo (current main commit via
-    #    VLLM_PRECOMPILED_WHEEL_VARIANT)
-    # 3. the variant corresponding to VLLM_MAIN_CUDA_VERSION from nightly repo
-    # 4. the default variant from nightly repo (current main commit)
+    assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
    wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
    if wheel_location is not None:
        wheel_url = wheel_location
-        download_filename = None
-        logger.info("Using user-specified precompiled wheel location: %s", wheel_url)
    else:
        import platform

        arch = platform.machine()
-        # try to fetch the wheel metadata from the nightly wheel repo
-        main_variant = envs.VLLM_MAIN_CUDA_VERSION.replace(".", "")
-        variant = os.getenv("VLLM_PRECOMPILED_WHEEL_VARIANT", main_variant)
-        commit = os.getenv(
-            "VLLM_PRECOMPILED_WHEEL_COMMIT",
-            precompiled_wheel_utils.get_base_commit_in_main_branch(),
-        )
-        logger.info(
-            "Using precompiled wheel commit %s with variant %s", commit, variant
-        )
-        try_default = False
-        wheels, repo_url, download_filename = None, None, None
-        try:
-            wheels, repo_url = _fetch_metadata_for_variant(commit, variant)
-        except Exception:
-            logger.warning(
-                "Failed to fetch precompiled wheel metadata for variant %s",
-                variant,
-                exc_info=True,
-            )
-            try_default = True  # try outside handler to keep the stacktrace simple
-        if try_default:
-            logger.info("Trying the default variant")
-            wheels, repo_url = _fetch_metadata_for_variant(commit, None)
-            # if this also fails, then we have nothing more to try / cache
-        assert wheels is not None and repo_url is not None, (
-            "Failed to fetch precompiled wheel metadata"
-        )
-        # The metadata.json has the following format:
-        # see .buildkite/scripts/generate-nightly-index.py for details
-        """[{
-"package_name": "vllm",
-"version": "0.11.2.dev278+gdbc3d9991",
-"build_tag": null,
-"python_tag": "cp38",
-"abi_tag": "abi3",
-"platform_tag": "manylinux1_x86_64",
-"variant": null,
-"filename": "vllm-0.11.2.dev278+gdbc3d9991-cp38-abi3-manylinux1_x86_64.whl",
-"path": "../vllm-0.11.2.dev278%2Bgdbc3d9991-cp38-abi3-manylinux1_x86_64.whl"
-},
-...]"""
-        for wheel in wheels:
-            # TODO: maybe check more compatibility later? (python_tag, abi_tag, etc)
-            if wheel.get("package_name") == "vllm" and arch in wheel.get(
-                "platform_tag", ""
-            ):
-                logger.info("Found precompiled wheel metadata: %s", wheel)
-                if "path" not in wheel:
-                    raise ValueError(f"Wheel metadata missing path: {wheel}")
-                wheel_url = repo_url + wheel["path"]
-                download_filename = wheel.get("filename")
-                logger.info("Using precompiled wheel URL: %s", wheel_url)
-                break
+        if arch == "x86_64":
+            wheel_tag = "manylinux1_x86_64"
+        elif arch == "aarch64":
+            wheel_tag = "manylinux2014_aarch64"
        else:
-            raise ValueError(
-                f"No precompiled vllm wheel found for architecture {arch} "
-                f"from repo {repo_url}. All available wheels: {wheels}"
-            )
-    patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
-        wheel_url, download_filename
-    )
+            raise ValueError(f"Unsupported architecture: {arch}")
+        base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
+        wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
+        nightly_wheel_url = (
+            f"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl"
+        )
+        from urllib.request import urlopen
+
+        try:
+            with urlopen(wheel_url) as resp:
+                if resp.status != 200:
+                    wheel_url = nightly_wheel_url
+        except Exception as e:
+            print(f"[warn] Falling back to nightly wheel: {e}")
+            wheel_url = nightly_wheel_url
+
+    patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(wheel_url)
    for pkg, files in patch.items():
        package_data.setdefault(pkg, []).extend(files)

--- a/vllm/envs.py
+++ b/vllm/envs.py
@ -74,7 +74,7 @@ if TYPE_CHECKING:
    VLLM_MEDIA_CONNECTOR: str = "http"
    VLLM_MM_INPUT_CACHE_GIB: int = 4
    VLLM_TARGET_DEVICE: str = "cuda"
-    VLLM_MAIN_CUDA_VERSION: str = "12.9"
+    VLLM_MAIN_CUDA_VERSION: str = "12.8"
    MAX_JOBS: str | None = None
    NVCC_THREADS: str | None = None
    VLLM_USE_PRECOMPILED: bool = False
@ -445,9 +445,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # Target device of vLLM, supporting [cuda (by default),
    # rocm, cpu]
    "VLLM_TARGET_DEVICE": lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(),
-    # Main CUDA version of vLLM. This follows PyTorch but can be overridden.
+    # Main CUDA version of vLLM, supporting [12.6, 12.8, 12.9];
+    # 12.8 is the default. This follows PyTorch but can be overridden.
    "VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower()
-    or "12.9",
+    or "12.8",
    # Maximum number of compilation jobs to run in parallel.
    # By default this is the number of CPUs
    "MAX_JOBS": lambda: os.getenv("MAX_JOBS", None),