diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 2477e60da4c16..c04bfac6644ca 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -197,68 +197,3 @@ steps: DOCKER_BUILDKIT: "1" DOCKERHUB_USERNAME: "vllmbot" - # Build nightly torch Docker images (x86) - - label: "Build nightly torch image (x86)" - depends_on: ~ - id: build-nightly-torch-image-x86 - if: build.env("NIGHTLY") == "1" - agents: - queue: cpu_queue_postmerge - commands: - - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg USE_TORCH_NIGHTLY=true --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-torch-nightly-x86_64 --target vllm-openai --progress plain -f docker/Dockerfile ." - - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-torch-nightly-x86_64" - - # Build nightly torch Docker images (arm64) - - label: "Build nightly torch image (arm64)" - depends_on: ~ - id: build-nightly-torch-image-arm64 - if: build.env("NIGHTLY") == "1" - agents: - queue: arm64_cpu_queue_postmerge - commands: - - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg USE_TORCH_NIGHTLY=true --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-torch-nightly-aarch64 --target vllm-openai --progress plain -f docker/Dockerfile ." 
- - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-torch-nightly-aarch64" - - # Create multi-arch manifest for nightly torch images - - label: "Create nightly torch multi-arch manifest" - depends_on: - - build-nightly-torch-image-x86 - - build-nightly-torch-image-arm64 - id: create-nightly-torch-multi-arch-manifest - if: build.env("NIGHTLY") == "1" - agents: - queue: cpu_queue_postmerge - commands: - - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-torch-nightly public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-torch-nightly-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-torch-nightly-aarch64 --amend" - - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-torch-nightly" - - # Publish nightly torch images to DockerHub - - label: "Publish nightly torch images to DockerHub" - depends_on: - - create-nightly-torch-multi-arch-manifest - if: build.env("NIGHTLY") == "1" - agents: - queue: cpu_queue_postmerge - commands: - - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-torch-nightly-x86_64" - - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-torch-nightly-aarch64" - - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-torch-nightly-x86_64 vllm/vllm-openai:torch-nightly-x86_64" - - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-torch-nightly-aarch64 vllm/vllm-openai:torch-nightly-aarch64" - - "docker push vllm/vllm-openai:torch-nightly-x86_64" - - "docker push vllm/vllm-openai:torch-nightly-aarch64" - - "docker manifest create vllm/vllm-openai:torch-nightly vllm/vllm-openai:torch-nightly-x86_64 
vllm/vllm-openai:torch-nightly-aarch64 --amend" - - "docker manifest create vllm/vllm-openai:torch-nightly-$BUILDKITE_COMMIT vllm/vllm-openai:torch-nightly-x86_64 vllm/vllm-openai:torch-nightly-aarch64 --amend" - - "docker manifest push vllm/vllm-openai:torch-nightly" - - "docker manifest push vllm/vllm-openai:torch-nightly-$BUILDKITE_COMMIT" - plugins: - - docker-login#v3.0.0: - username: vllmbot - password-env: DOCKERHUB_TOKEN - env: - DOCKER_BUILDKIT: "1" - DOCKERHUB_USERNAME: "vllmbot" - diff --git a/docker/Dockerfile b/docker/Dockerfile index 6734a75ad8143..cdfca180c809d 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -157,8 +157,13 @@ WORKDIR /workspace # install build and runtime dependencies COPY requirements/common.txt requirements/common.txt COPY requirements/cuda.txt requirements/cuda.txt +COPY use_existing_torch.py use_existing_torch.py RUN --mount=type=cache,target=/root/.cache/uv \ . /etc/environment && \ + if [ "$USE_TORCH_NIGHTLY" = "true" ]; then \ + echo ">>> Running use_existing_torch.py to reset torch dependencies for nightly build" && \ + python3 use_existing_torch.py; \ + fi && \ uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \ --extra-index-url ${PYTORCH_INDEX} ${PRERELEASE_FLAG} @@ -178,6 +183,7 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL # install build dependencies COPY requirements/build.txt requirements/build.txt +COPY use_existing_torch.py use_existing_torch.py # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # Reference: https://github.com/astral-sh/uv/pull/1694 @@ -188,6 +194,10 @@ ENV UV_LINK_MODE=copy RUN --mount=type=cache,target=/root/.cache/uv \ . 
/etc/environment && \ + if [ "$USE_TORCH_NIGHTLY" = "true" ]; then \ + echo ">>> Running use_existing_torch.py to reset torch dependencies for nightly build" && \ + python3 use_existing_torch.py; \ + fi && \ uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \ --extra-index-url ${PYTORCH_INDEX} ${PRERELEASE_FLAG} @@ -311,6 +321,7 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL # install build dependencies COPY requirements/build.txt requirements/build.txt +COPY use_existing_torch.py use_existing_torch.py # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # Reference: https://github.com/astral-sh/uv/pull/1694 @@ -321,6 +332,10 @@ ENV UV_LINK_MODE=copy RUN --mount=type=cache,target=/root/.cache/uv \ . /etc/environment && \ + if [ "$USE_TORCH_NIGHTLY" = "true" ]; then \ + echo ">>> Running use_existing_torch.py to reset torch dependencies for nightly build" && \ + python3 use_existing_torch.py; \ + fi && \ uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \ --extra-index-url ${PYTORCH_INDEX} ${PRERELEASE_FLAG} @@ -501,11 +516,16 @@ ENV PRERELEASE_FLAG="" COPY requirements/common.txt /tmp/common.txt COPY requirements/cuda.txt /tmp/requirements-cuda.txt +COPY use_existing_torch.py /tmp/use_existing_torch.py RUN --mount=type=cache,target=/root/.cache/uv \ . 
/etc/environment && \ + if [ "$USE_TORCH_NIGHTLY" = "true" ]; then \ + echo ">>> Running use_existing_torch.py to reset torch dependencies for nightly build" && \ + cd /tmp && python3 use_existing_torch.py; \ + fi && \ uv pip install --system -r /tmp/requirements-cuda.txt \ --extra-index-url ${PYTORCH_INDEX} ${PRERELEASE_FLAG} && \ - rm /tmp/requirements-cuda.txt /tmp/common.txt + rm /tmp/requirements-cuda.txt /tmp/common.txt /tmp/use_existing_torch.py # Install FlashInfer pre-compiled kernel cache and binaries # This is ~1.1GB and only changes when FlashInfer version bumps diff --git a/use_existing_torch.py b/use_existing_torch.py index e2d3f2ec81956..93274de8ce99a 100644 --- a/use_existing_torch.py +++ b/use_existing_torch.py @@ -2,17 +2,85 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import glob +import os +import re + +# Collect all files to process +files_to_process = list(glob.glob("requirements/*.txt")) + +# Add pyproject.toml if it exists +if os.path.exists("pyproject.toml"): + files_to_process.append("pyproject.toml") + +# Pattern to match torch package names we want to unpin +TORCH_PACKAGES = ['torch', 'torchaudio', 'torchvision', 'triton'] + +def unpin_torch_dependency(line): + """Remove version pinning from torch-related packages, keep the package name.""" + original_line = line + line_stripped = line.strip() + + # Skip empty lines + if not line_stripped: + return line + + # Skip full comment lines + if line_stripped.startswith('#'): + return line + + # Check if this line contains a torch package + for pkg in TORCH_PACKAGES: + # Match the package name at a name boundary (case insensitive); the lookahead keeps prefix hits on other distributions (torchao, torch_xla, tritonclient, ...) from being unpinned + if re.match(rf"{pkg}(?![a-z0-9._-])", line_stripped.lower()): + # Extract inline comment if present + comment = '' + if '#' in line: + pkg_and_version, comment = line.split('#', 1) + comment = ' #' + comment.rstrip('\n') + else: + pkg_and_version = line + + # Check if there's a version specifier + # Matches any version constraint operators: ==, >=, <=, >, <, 
!=, ~= + if re.search(r'[=<>!~]', pkg_and_version): + # Get original capitalization of package name from the original line + orig_pkg = line_stripped.split()[0] if line_stripped.split() else pkg + # Extract just the package name without any version info + orig_pkg = re.split(r'[=<>!~]', orig_pkg)[0] + + result = f"{orig_pkg}{comment}\n" if comment else f"{orig_pkg}\n" + print(f" unpinned: {line.strip()} -> {result.strip()}") + return result + + return line + +for file in files_to_process: + if not os.path.exists(file): + print(f">>> skipping {file} (does not exist)") + continue -for file in (*glob.glob("requirements/*.txt"), "pyproject.toml"): print(f">>> cleaning {file}") - with open(file) as f: - lines = f.readlines() - if "torch" in "".join(lines).lower(): - print("removed:") - with open(file, "w") as f: - for line in lines: - if "torch" not in line.lower(): - f.write(line) - else: - print(line.strip()) + try: + with open(file) as f: + lines = f.readlines() + except Exception as e: + print(f"!!! error reading {file}: {e}") + continue + + # Check if we need to process this file + has_torch = any(any(pkg in line.lower() for pkg in TORCH_PACKAGES) for line in lines) + + if has_torch: + print("unpinning torch dependencies:") + try: + with open(file, "w") as f: + for line in lines: + new_line = unpin_torch_dependency(line) + f.write(new_line) + except Exception as e: + print(f"!!! error writing {file}: {e}") + continue + else: + print(" (no torch dependencies found)") + print(f"<<< done cleaning {file}\n")