From 2c4f59afc3d50fda805c4ad94c9d9be168cded0b Mon Sep 17 00:00:00 2001
From: Huy Do <huydhn@gmail.com>
Date: Tue, 29 Apr 2025 19:08:04 -0700
Subject: [PATCH] Update PyTorch to 2.7.0 (#16859)

---
 .buildkite/release-pipeline.yaml              | 10 ++--
 .buildkite/scripts/upload-wheels.sh           | 18 ++++----
 .buildkite/test-pipeline.yaml                 |  2 +-
 .github/workflows/lint-and-deploy.yaml        |  4 +-
 .pre-commit-config.yaml                       |  2 +-
 CMakeLists.txt                                |  4 +-
 docker/Dockerfile                             | 46 +++++++++++++------
 .../installation/gpu/cuda.inc.md              |  6 +--
 .../online_serving/chart-helm/values.yaml     |  2 +-
 pyproject.toml                                |  2 +-
 requirements/build.txt                        |  2 +-
 requirements/cpu.txt                          | 11 +++--
 requirements/cuda.txt                         |  9 ++--
 requirements/rocm-build.txt                   |  6 +--
 requirements/test.in                          |  6 +--
 requirements/test.txt                         | 44 ++++++++++--------
 setup.py                                      |  2 +-
 vllm/attention/ops/ipex_attn.py               |  3 +-
 18 files changed, 102 insertions(+), 77 deletions(-)

diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 642c0259c7893..03e2267a1b4ea 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,20 +1,20 @@
 steps:
-  - label: "Build wheel - CUDA 12.4"
+  - label: "Build wheel - CUDA 12.8"
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
-  - label: "Build wheel - CUDA 12.1"
+  - label: "Build wheel - CUDA 12.6"
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
@@ -48,7 +48,7 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 
   - label: "Build and publish TPU release image"
diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh
index a681f89270600..75e3ef2640956 100644
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -50,11 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 if [[ $normal_wheel == *"cu118"* ]]; then
     # if $normal_wheel matches cu118, do not upload the index.html
     echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu121"* ]]; then
-    # if $normal_wheel matches cu121, do not upload the index.html
-    echo "Skipping index files for cu121 wheels"
+elif [[ $normal_wheel == *"cu126"* ]]; then
+    # if $normal_wheel matches cu126, do not upload the index.html
+    echo "Skipping index files for cu126 wheels"
 else
-    # only upload index.html for cu124 wheels (default wheels)
+    # only upload index.html for cu128 wheels (default wheels)
     aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
     aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 fi
@@ -66,12 +66,12 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
 if [[ $normal_wheel == *"cu118"* ]]; then
     # if $normal_wheel matches cu118, do not upload the index.html
     echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu121"* ]]; then
-    # if $normal_wheel matches cu121, do not upload the index.html
-    echo "Skipping index files for cu121 wheels"
+elif [[ $normal_wheel == *"cu126"* ]]; then
+    # if $normal_wheel matches cu126, do not upload the index.html
+    echo "Skipping index files for cu126 wheels"
 else
-    # only upload index.html for cu124 wheels (default wheels)
+    # only upload index.html for cu128 wheels (default wheels)
     aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
 fi
 
-aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
\ No newline at end of file
+aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index fc0eb3d9f0be4..8da43322c5cad 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -455,7 +455,7 @@ steps:
   - tests/models/encoder_decoder/language
   commands:
     # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-    - pip install causal-conv1d
+    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
     - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
     - pytest -v -s models/embedding/language -m core_model
 
diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml
index 7b1d9f69938c8..dd9b61a647345 100644
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@@ -66,7 +66,7 @@ jobs:
           export AWS_SECRET_ACCESS_KEY=minioadmin
           sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
           helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
-    
+
       - name: curl test
         run: |
           kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
@@ -79,4 +79,4 @@ jobs:
                           "max_tokens": 7,
                           "temperature": 0
                   }'):$CODE"
-          echo "$CODE"
\ No newline at end of file
+          echo "$CODE"
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 87681d7eb960c..90ed492d992a7 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -46,7 +46,7 @@ repos:
   rev: 0.6.17
   hooks:
     - id: pip-compile
-      args: [requirements/test.in, -o, requirements/test.txt]
+      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match]
       files: ^requirements/test\.(in|txt)$
 - repo: local
   hooks:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3314f05fd2a07..6be9adcb8252d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -46,8 +46,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
 
 #
 # Try to find python package with an executable that exactly matches
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 7d1fac9db245d..17adb7a92dc19 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -5,11 +5,11 @@
 # docs/source/contributing/dockerfile/dockerfile.md and
 # docs/source/assets/contributing/dockerfile-stages-dependency.png
 
-ARG CUDA_VERSION=12.4.1
+ARG CUDA_VERSION=12.8.1
 #################### BASE BUILD IMAGE ####################
 # prepare basic build environment
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
-ARG CUDA_VERSION=12.4.1
+ARG CUDA_VERSION=12.8.1
 ARG PYTHON_VERSION=3.12
 ARG TARGETPLATFORM
 ENV DEBIAN_FRONTEND=noninteractive
@@ -37,6 +37,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
 
 # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
 # as it was causing spam when compiling the CUTLASS kernels
@@ -69,7 +70,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 COPY requirements/common.txt requirements/common.txt
 COPY requirements/cuda.txt requirements/cuda.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/cuda.txt
+    uv pip install --system -r requirements/cuda.txt \
+    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
@@ -92,9 +94,11 @@ COPY requirements/build.txt requirements/build.txt
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
 
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/build.txt
+    uv pip install --system -r requirements/build.txt \
+    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
 COPY . .
 ARG GIT_REPO_CHECK=0
@@ -161,22 +165,25 @@ FROM base as dev
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
+
+# Workaround for #17068
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
 
 COPY requirements/lint.txt requirements/lint.txt
 COPY requirements/test.txt requirements/test.txt
 COPY requirements/dev.txt requirements/dev.txt
-# Workaround for #17068
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system mamba-ssm==2.2.4 --no-build-isolation
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/dev.txt
+    uv pip install --system -r requirements/dev.txt \
+    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 #################### DEV IMAGE ####################
 
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
 # TODO: Restore to base image after FlashInfer AOT wheel fixed
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
-ARG CUDA_VERSION=12.4.1
+ARG CUDA_VERSION=12.8.1
 ARG PYTHON_VERSION=3.12
 WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
@@ -209,6 +216,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
 
 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@@ -229,7 +237,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system dist/*.whl --verbose
+    uv pip install --system dist/*.whl --verbose \
+    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
 # If we need to build FlashInfer wheel before its release:
 # $ export FLASHINFER_ENABLE_AOT=1
@@ -246,19 +255,26 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 RUN --mount=type=cache,target=/root/.cache/uv \
 . /etc/environment && \
 if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
-    uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
+    # TESTING: install FlashInfer from source to test 2.7.0 final RC
+    FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX' \
+    uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.2.post1" ; \
 fi
 COPY examples examples
 COPY benchmarks benchmarks
 COPY ./vllm/collect_env.py .
 
+RUN --mount=type=cache,target=/root/.cache/uv \
+. /etc/environment && \
+uv pip list
+
 # Although we build Flashinfer with AOT mode, there's still
 # some issues w.r.t. JIT compilation. Therefore we need to
 # install build dependencies for JIT compilation.
 # TODO: Remove this once FlashInfer AOT wheel is fixed
 COPY requirements/build.txt requirements/build.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/build.txt
+    uv pip install --system -r requirements/build.txt \
+    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
 #################### vLLM installation IMAGE ####################
 
@@ -272,11 +288,13 @@ ADD . /vllm-workspace/
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
 
-# install development dependencies (for testing)
 # Workaround for #17068
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system mamba-ssm==2.2.4 --no-build-isolation
+    uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
+
+# install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system -r requirements/dev.txt
 
diff --git a/docs/source/getting_started/installation/gpu/cuda.inc.md b/docs/source/getting_started/installation/gpu/cuda.inc.md
index 46bdb08ebb77c..06915f09dd517 100644
--- a/docs/source/getting_started/installation/gpu/cuda.inc.md
+++ b/docs/source/getting_started/installation/gpu/cuda.inc.md
@@ -1,6 +1,6 @@
 # Installation
 
-vLLM contains pre-compiled C++ and CUDA (12.1) binaries.
+vLLM contains pre-compiled C++ and CUDA (12.6) binaries.
 
 ## Requirements
 
@@ -23,12 +23,12 @@ Therefore, it is recommended to install vLLM with a **fresh new** environment. I
 You can install vLLM using either `pip` or `uv pip`:
 
 ```console
-# Install vLLM with CUDA 12.4.
+# Install vLLM with CUDA 12.6.
 pip install vllm # If you are using pip.
 uv pip install vllm # If you are using uv.
 ```
 
-As of now, vLLM's binaries are compiled with CUDA 12.4 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.1, 11.8, and public PyTorch release versions:
+As of now, vLLM's binaries are compiled with CUDA 12.6 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.8, 11.8, and public PyTorch release versions:
 
 ```console
 # Install vLLM with CUDA 11.8.
diff --git a/examples/online_serving/chart-helm/values.yaml b/examples/online_serving/chart-helm/values.yaml
index 9c48e7d061bf7..28dba9a6f6882 100644
--- a/examples/online_serving/chart-helm/values.yaml
+++ b/examples/online_serving/chart-helm/values.yaml
@@ -8,7 +8,7 @@ image:
   # -- Image tag
   tag: "latest"
   # -- Container launch command
-  command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "bfloat16", "--host", "0.0.0.0", "--port", "8000"]
+  command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "float32", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"]
 
 # -- Container port
 containerPort: 8000
diff --git a/pyproject.toml b/pyproject.toml
index b5f1039b44dac..c85e85b0c82b0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "packaging",
     "setuptools>=61",
     "setuptools-scm>=8.0",
-    "torch == 2.6.0",
+    "torch == 2.7.0",
     "wheel",
     "jinja2",
 ]
diff --git a/requirements/build.txt b/requirements/build.txt
index 13d643bcaff10..19d757b452ac1 100644
--- a/requirements/build.txt
+++ b/requirements/build.txt
@@ -4,6 +4,6 @@ ninja
 packaging
 setuptools>=61
 setuptools-scm>=8
-torch==2.6.0
+torch==2.7.0
 wheel
 jinja2>=3.1.6
diff --git a/requirements/cpu.txt b/requirements/cpu.txt
index 69f732c2417a1..752931158a056 100644
--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@@ -2,18 +2,19 @@
 -r common.txt
 
 # Dependencies for CPUs
-torch==2.6.0+cpu; platform_machine == "x86_64"
-torch==2.6.0; platform_system == "Darwin"
-torch==2.6.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.7.0+cpu; platform_machine == "x86_64"
+torch==2.7.0; platform_system == "Darwin"
+torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
 torch==2.7.0.dev20250304; platform_machine == "s390x"
 
 # required for the image processor of minicpm-o-2_6, this must be updated alongside torch
 torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
-torchaudio==2.6.0; platform_machine == "ppc64le"
+torchaudio==2.7.0; platform_machine == "ppc64le"
 
 # required for the image processor of phi3v, this must be updated alongside torch
 torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
-torchvision==0.21.0; platform_machine == "ppc64le"
+torchvision==0.22.0; platform_machine == "ppc64le"
 datasets # for benchmark scripts
 
 # cpu cannot use triton 3.3.0
diff --git a/requirements/cuda.txt b/requirements/cuda.txt
index cdc6ee75afbcd..a71d9728f38ad 100644
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -6,8 +6,9 @@ numba == 0.61.2; python_version > '3.9'
 
 # Dependencies for NVIDIA GPUs
 ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1.
-torch==2.6.0
-torchaudio==2.6.0
+torch==2.7.0
+torchaudio==2.7.0
 # These must be updated alongside torch
-torchvision==0.21.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-xformers==0.0.29.post2; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.6.0
+torchvision==0.22.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+# https://github.com/facebookresearch/xformers/releases/tag/v0.0.30
+xformers==0.0.30; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch >= 2.7
diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt
index 05de4ff168453..55ffe82e81779 100644
--- a/requirements/rocm-build.txt
+++ b/requirements/rocm-build.txt
@@ -2,9 +2,9 @@
 -r common.txt
 
 --extra-index-url https://download.pytorch.org/whl/rocm6.2.4
-torch==2.6.0
-torchvision==0.21.0
-torchaudio==2.6.0
+torch==2.7.0
+torchvision==0.22.0
+torchaudio==2.7.0
 
 triton==3.2
 cmake>=3.26,<4
diff --git a/requirements/test.in b/requirements/test.in
index c5d2c4cd4c30f..ee79aae58d7e6 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -23,9 +23,9 @@ sentence-transformers # required for embedding tests
 soundfile # required for audio tests
 jiwer # required for audio tests
 timm # required for internvl test
-torch==2.6.0
-torchaudio==2.6.0
-torchvision==0.21.0
+torch==2.7.0
+torchaudio==2.7.0
+torchvision==0.22.0
 transformers_stream_generator # required for qwen-vl test
 mamba_ssm # required for plamo2 test
 matplotlib # required for qwen-vl test
diff --git a/requirements/test.txt b/requirements/test.txt
index 9642a5bfe68d4..2e8121e3882eb 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -1,5 +1,5 @@
 # This file was autogenerated by uv via the following command:
-#    uv pip compile requirements/test.in -o requirements/test.txt
+#    uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match
 absl-py==2.1.0
     # via rouge-score
 accelerate==1.0.1
@@ -274,7 +274,7 @@ mamba-ssm==2.2.4
     # via -r requirements/test.in
 markdown-it-py==3.0.0
     # via rich
-markupsafe==3.0.2
+markupsafe==3.0.1
     # via
     #   jinja2
     #   werkzeug
@@ -355,39 +355,42 @@ numpy==1.26.4
     #   transformers
     #   tritonclient
     #   vocos
-nvidia-cublas-cu12==12.4.5.8
+nvidia-cublas-cu12==12.6.4.1
     # via
     #   nvidia-cudnn-cu12
     #   nvidia-cusolver-cu12
     #   torch
-nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-cupti-cu12==12.6.80
     # via torch
-nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.6.77
     # via torch
-nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.6.77
     # via torch
-nvidia-cudnn-cu12==9.1.0.70
+nvidia-cudnn-cu12==9.5.1.17
     # via torch
-nvidia-cufft-cu12==11.2.1.3
+nvidia-cufft-cu12==11.3.0.4
     # via torch
-nvidia-curand-cu12==10.3.5.147
+nvidia-cufile-cu12==1.11.1.6
     # via torch
-nvidia-cusolver-cu12==11.6.1.9
+nvidia-curand-cu12==10.3.7.77
     # via torch
-nvidia-cusparse-cu12==12.3.1.170
+nvidia-cusolver-cu12==11.7.1.2
+    # via torch
+nvidia-cusparse-cu12==12.5.4.2
     # via
     #   nvidia-cusolver-cu12
     #   torch
-nvidia-cusparselt-cu12==0.6.2
+nvidia-cusparselt-cu12==0.6.3
     # via torch
-nvidia-nccl-cu12==2.21.5
+nvidia-nccl-cu12==2.26.2
     # via torch
-nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvjitlink-cu12==12.6.85
     # via
+    #   nvidia-cufft-cu12
     #   nvidia-cusolver-cu12
     #   nvidia-cusparse-cu12
     #   torch
-nvidia-nvtx-cu12==12.4.127
+nvidia-nvtx-cu12==12.6.77
     # via torch
 opencv-python-headless==4.11.0.86
     # via
@@ -634,6 +637,7 @@ setuptools==75.8.0
     #   mamba-ssm
     #   pytablewriter
     #   torch
+    #   triton
 shellingham==1.5.4
     # via typer
 six==1.16.0
@@ -664,7 +668,7 @@ starlette-testclient==0.4.1
     # via schemathesis
 statsmodels==0.14.4
     # via genai-perf
-sympy==1.13.1
+sympy==1.13.3
     # via
     #   einx
     #   torch
@@ -696,7 +700,7 @@ tomli==2.2.1
     # via schemathesis
 tomli-w==1.2.0
     # via schemathesis
-torch==2.6.0
+torch==2.7.0
     # via
     #   -r requirements/test.in
     #   accelerate
@@ -714,12 +718,12 @@ torch==2.6.0
     #   torchvision
     #   vector-quantize-pytorch
     #   vocos
-torchaudio==2.6.0
+torchaudio==2.7.0
     # via
     #   -r requirements/test.in
     #   encodec
     #   vocos
-torchvision==0.21.0
+torchvision==0.22.0
     # via
     #   -r requirements/test.in
     #   timm
@@ -748,7 +752,7 @@ transformers==4.51.3
     #   transformers-stream-generator
 transformers-stream-generator==0.0.5
     # via -r requirements/test.in
-triton==3.2.0
+triton==3.3.0
     # via torch
 tritonclient==2.51.0
     # via
diff --git a/setup.py b/setup.py
index a1867960e5930..7675fbdf3efec 100755
--- a/setup.py
+++ b/setup.py
@@ -54,7 +54,7 @@ elif (sys.platform.startswith("linux") and torch.version.cuda is None
     # fallback to cpu
     VLLM_TARGET_DEVICE = "cpu"
 
-MAIN_CUDA_VERSION = "12.4"
+MAIN_CUDA_VERSION = "12.8"
 
 
 def is_sccache_available() -> bool:
diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py
index 6d96f58320c84..1702203b18346 100644
--- a/vllm/attention/ops/ipex_attn.py
+++ b/vllm/attention/ops/ipex_attn.py
@@ -5,7 +5,8 @@ from typing import Dict, List, Optional, Tuple
 try:
     import intel_extension_for_pytorch.llm.modules as ipex_modules
     _use_ipex = True
-except ImportError:
+# AttributeError is to handle a bug in ipex https://github.com/intel/intel-extension-for-pytorch/pull/813
+except (ImportError, AttributeError):
     _use_ipex = False
 
 import torch