From 2c4f59afc3d50fda805c4ad94c9d9be168cded0b Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 29 Apr 2025 19:08:04 -0700 Subject: [PATCH] Update PyTorch to 2.7.0 (#16859) --- .buildkite/release-pipeline.yaml | 10 ++-- .buildkite/scripts/upload-wheels.sh | 18 ++++---- .buildkite/test-pipeline.yaml | 2 +- .github/workflows/lint-and-deploy.yaml | 4 +- .pre-commit-config.yaml | 2 +- CMakeLists.txt | 4 +- docker/Dockerfile | 46 +++++++++++++------ .../installation/gpu/cuda.inc.md | 6 +-- .../online_serving/chart-helm/values.yaml | 2 +- pyproject.toml | 2 +- requirements/build.txt | 2 +- requirements/cpu.txt | 11 +++-- requirements/cuda.txt | 9 ++-- requirements/rocm-build.txt | 6 +-- requirements/test.in | 6 +-- requirements/test.txt | 44 ++++++++++-------- setup.py | 2 +- vllm/attention/ops/ipex_attn.py | 3 +- 18 files changed, 102 insertions(+), 77 deletions(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 642c0259c7893..03e2267a1b4ea 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -1,20 +1,20 @@ steps: - - label: "Build wheel - CUDA 12.4" + - label: "Build wheel - CUDA 12.8" agents: queue: cpu_queue_postmerge commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." 
- "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" - - label: "Build wheel - CUDA 12.1" + - label: "Build wheel - CUDA 12.6" agents: queue: cpu_queue_postmerge commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-wheels.sh" @@ -48,7 +48,7 @@ steps: queue: cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." 
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - label: "Build and publish TPU release image" diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index a681f89270600..75e3ef2640956 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -50,11 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" if [[ $normal_wheel == *"cu118"* ]]; then # if $normal_wheel matches cu118, do not upload the index.html echo "Skipping index files for cu118 wheels" -elif [[ $normal_wheel == *"cu121"* ]]; then - # if $normal_wheel matches cu121, do not upload the index.html - echo "Skipping index files for cu121 wheels" +elif [[ $normal_wheel == *"cu126"* ]]; then + # if $normal_wheel matches cu126, do not upload the index.html + echo "Skipping index files for cu126 wheels" else - # only upload index.html for cu124 wheels (default wheels) + # only upload index.html for cu128 wheels (default wheels) aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" fi @@ -66,12 +66,12 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/" if [[ $normal_wheel == *"cu118"* ]]; then # if $normal_wheel matches cu118, do not upload the index.html echo "Skipping index files for cu118 wheels" -elif [[ $normal_wheel == *"cu121"* ]]; then - # if $normal_wheel matches cu121, do not upload the index.html - echo "Skipping index files for cu121 wheels" +elif [[ $normal_wheel == *"cu126"* ]]; then + # if $normal_wheel matches cu126, do not upload the index.html + echo "Skipping index files for cu126 wheels" else - # only upload index.html for cu124 wheels (default wheels) + # only upload index.html for cu128 wheels (default wheels) aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" fi -aws s3 cp "$wheel" "s3://vllm-wheels/$version/" \ No newline at end of file +aws 
s3 cp "$wheel" "s3://vllm-wheels/$version/" diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index fc0eb3d9f0be4..8da43322c5cad 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -455,7 +455,7 @@ steps: - tests/models/encoder_decoder/language commands: # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. - - pip install causal-conv1d + - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' - pytest -v -s models/decoder_only/language -m 'core_model or quant_model' - pytest -v -s models/embedding/language -m core_model diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 7b1d9f69938c8..dd9b61a647345 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -66,7 +66,7 @@ jobs: export AWS_SECRET_ACCESS_KEY=minioadmin sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" & helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - + - name: curl test run: | kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 & @@ 
-79,4 +79,4 @@ jobs: "max_tokens": 7, "temperature": 0 }'):$CODE" - echo "$CODE" \ No newline at end of file + echo "$CODE" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 87681d7eb960c..90ed492d992a7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -46,7 +46,7 @@ repos: rev: 0.6.17 hooks: - id: pip-compile - args: [requirements/test.in, -o, requirements/test.txt] + args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match] files: ^requirements/test\.(in|txt)$ - repo: local hooks: diff --git a/CMakeLists.txt b/CMakeLists.txt index 3314f05fd2a07..6be9adcb8252d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,8 +46,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1 # requirements.txt files and should be kept consistent. The ROCm torch # versions are derived from docker/Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0") -set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0") +set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0") +set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0") # # Try to find python package with an executable that exactly matches diff --git a/docker/Dockerfile b/docker/Dockerfile index 7d1fac9db245d..17adb7a92dc19 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -5,11 +5,11 @@ # docs/source/contributing/dockerfile/dockerfile.md and # docs/source/assets/contributing/dockerfile-stages-dependency.png -ARG CUDA_VERSION=12.4.1 +ARG CUDA_VERSION=12.8.1 #################### BASE BUILD IMAGE #################### # prepare basic build environment FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base -ARG CUDA_VERSION=12.4.1 +ARG CUDA_VERSION=12.8.1 ARG PYTHON_VERSION=3.12 ARG TARGETPLATFORM ENV DEBIAN_FRONTEND=noninteractive @@ -37,6 +37,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # Reference: 
https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 # as it was causing spam when compiling the CUTLASS kernels @@ -69,7 +70,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ COPY requirements/common.txt requirements/common.txt COPY requirements/cuda.txt requirements/cuda.txt RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements/cuda.txt + uv pip install --system -r requirements/cuda.txt \ + --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') # cuda arch list used by torch # can be useful for both `dev` and `test` @@ -92,9 +94,11 @@ COPY requirements/build.txt requirements/build.txt # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements/build.txt + uv pip install --system -r requirements/build.txt \ + --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') COPY . . 
ARG GIT_REPO_CHECK=0 @@ -161,22 +165,25 @@ FROM base as dev # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" + +# Workaround for #17068 +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4" COPY requirements/lint.txt requirements/lint.txt COPY requirements/test.txt requirements/test.txt COPY requirements/dev.txt requirements/dev.txt -# Workaround for #17068 RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system mamba-ssm==2.2.4 --no-build-isolation -RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements/dev.txt + uv pip install --system -r requirements/dev.txt \ + --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') #################### DEV IMAGE #################### #################### vLLM installation IMAGE #################### # image with vLLM installed # TODO: Restore to base image after FlashInfer AOT wheel fixed FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base -ARG CUDA_VERSION=12.4.1 +ARG CUDA_VERSION=12.8.1 ARG PYTHON_VERSION=3.12 WORKDIR /vllm-workspace ENV DEBIAN_FRONTEND=noninteractive @@ -209,6 +216,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" # Workaround for https://github.com/openai/triton/issues/2507 and # https://github.com/pytorch/pytorch/issues/107960 -- hopefully @@ -229,7 +237,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Install vllm wheel first, so that torch etc will be installed. 
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system dist/*.whl --verbose + uv pip install --system dist/*.whl --verbose \ + --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') # If we need to build FlashInfer wheel before its release: # $ export FLASHINFER_ENABLE_AOT=1 @@ -246,19 +255,26 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist RUN --mount=type=cache,target=/root/.cache/uv \ . /etc/environment && \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ - uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \ + # TESTING: install FlashInfer from source to test 2.7.0 final RC + FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX' \ + uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.2.post1" ; \ fi COPY examples examples COPY benchmarks benchmarks COPY ./vllm/collect_env.py . +RUN --mount=type=cache,target=/root/.cache/uv \ +. /etc/environment && \ +uv pip list + # Although we build Flashinfer with AOT mode, there's still # some issues w.r.t. JIT compilation. Therefore we need to # install build dependencies for JIT compilation. # TODO: Remove this once FlashInfer AOT wheel is fixed COPY requirements/build.txt requirements/build.txt RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements/build.txt + uv pip install --system -r requirements/build.txt \ + --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') #################### vLLM installation IMAGE #################### @@ -272,11 +288,13 @@ ADD . 
/vllm-workspace/ # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # Reference: https://github.com/astral-sh/uv/pull/1694 ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" -# install development dependencies (for testing) # Workaround for #17068 RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system mamba-ssm==2.2.4 --no-build-isolation + uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4" + +# install development dependencies (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system -r requirements/dev.txt diff --git a/docs/source/getting_started/installation/gpu/cuda.inc.md b/docs/source/getting_started/installation/gpu/cuda.inc.md index 46bdb08ebb77c..06915f09dd517 100644 --- a/docs/source/getting_started/installation/gpu/cuda.inc.md +++ b/docs/source/getting_started/installation/gpu/cuda.inc.md @@ -1,6 +1,6 @@ # Installation -vLLM contains pre-compiled C++ and CUDA (12.1) binaries. +vLLM contains pre-compiled C++ and CUDA (12.8) binaries. ## Requirements @@ -23,12 +23,12 @@ Therefore, it is recommended to install vLLM with a **fresh new** environment. I You can install vLLM using either `pip` or `uv pip`: ```console -# Install vLLM with CUDA 12.4. +# Install vLLM with CUDA 12.8. pip install vllm # If you are using pip. uv pip install vllm # If you are using uv. ``` -As of now, vLLM's binaries are compiled with CUDA 12.4 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.1, 11.8, and public PyTorch release versions: +As of now, vLLM's binaries are compiled with CUDA 12.8 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.6, 11.8, and public PyTorch release versions: ```console # Install vLLM with CUDA 11.8. 
diff --git a/examples/online_serving/chart-helm/values.yaml b/examples/online_serving/chart-helm/values.yaml index 9c48e7d061bf7..28dba9a6f6882 100644 --- a/examples/online_serving/chart-helm/values.yaml +++ b/examples/online_serving/chart-helm/values.yaml @@ -8,7 +8,7 @@ image: # -- Image tag tag: "latest" # -- Container launch command - command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "bfloat16", "--host", "0.0.0.0", "--port", "8000"] + command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "float32", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"] # -- Container port containerPort: 8000 diff --git a/pyproject.toml b/pyproject.toml index b5f1039b44dac..c85e85b0c82b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "packaging", "setuptools>=61", "setuptools-scm>=8.0", - "torch == 2.6.0", + "torch == 2.7.0", "wheel", "jinja2", ] diff --git a/requirements/build.txt b/requirements/build.txt index 13d643bcaff10..19d757b452ac1 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -4,6 +4,6 @@ ninja packaging setuptools>=61 setuptools-scm>=8 -torch==2.6.0 +torch==2.7.0 wheel jinja2>=3.1.6 diff --git a/requirements/cpu.txt b/requirements/cpu.txt index 69f732c2417a1..752931158a056 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -2,18 +2,19 @@ -r common.txt # Dependencies for CPUs -torch==2.6.0+cpu; platform_machine == "x86_64" -torch==2.6.0; platform_system == "Darwin" -torch==2.6.0; platform_machine == "ppc64le" or platform_machine == "aarch64" +--extra-index-url https://download.pytorch.org/whl/cpu +torch==2.7.0+cpu; platform_machine == "x86_64" +torch==2.7.0; platform_system == "Darwin" +torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64" torch==2.7.0.dev20250304; platform_machine == "s390x" # required for the image processor of minicpm-o-2_6, this must be updated alongside torch torchaudio; platform_machine != 
"ppc64le" and platform_machine != "s390x" -torchaudio==2.6.0; platform_machine == "ppc64le" +torchaudio==2.7.0; platform_machine == "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch torchvision; platform_machine != "ppc64le" and platform_machine != "s390x" -torchvision==0.21.0; platform_machine == "ppc64le" +torchvision==0.22.0; platform_machine == "ppc64le" datasets # for benchmark scripts # cpu cannot use triton 3.3.0 diff --git a/requirements/cuda.txt b/requirements/cuda.txt index cdc6ee75afbcd..a71d9728f38ad 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -6,8 +6,9 @@ numba == 0.61.2; python_version > '3.9' # Dependencies for NVIDIA GPUs ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1. -torch==2.6.0 -torchaudio==2.6.0 +torch==2.7.0 +torchaudio==2.7.0 # These must be updated alongside torch -torchvision==0.21.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version -xformers==0.0.29.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.6.0 +torchvision==0.22.0 # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version +# https://github.com/facebookresearch/xformers/releases/tag/v0.0.30 +xformers==0.0.30; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7 diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt index 05de4ff168453..55ffe82e81779 100644 --- a/requirements/rocm-build.txt +++ b/requirements/rocm-build.txt @@ -2,9 +2,9 @@ -r common.txt --extra-index-url https://download.pytorch.org/whl/rocm6.2.4 -torch==2.6.0 -torchvision==0.21.0 -torchaudio==2.6.0 +torch==2.7.0 +torchvision==0.22.0 +torchaudio==2.7.0 triton==3.2 cmake>=3.26,<4 diff --git a/requirements/test.in b/requirements/test.in index c5d2c4cd4c30f..ee79aae58d7e6 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -23,9 +23,9 @@ sentence-transformers # required for embedding tests soundfile # required for audio tests jiwer # required for audio tests timm # required for internvl test -torch==2.6.0 -torchaudio==2.6.0 -torchvision==0.21.0 +torch==2.7.0 +torchaudio==2.7.0 +torchvision==0.22.0 transformers_stream_generator # required for qwen-vl test mamba_ssm # required for plamo2 test matplotlib # required for qwen-vl test diff --git a/requirements/test.txt b/requirements/test.txt index 9642a5bfe68d4..2e8121e3882eb 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,5 +1,5 @@ # This file was autogenerated by uv via the following command: -# uv pip compile requirements/test.in -o requirements/test.txt +# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match absl-py==2.1.0 # via rouge-score accelerate==1.0.1 @@ -274,7 +274,7 @@ mamba-ssm==2.2.4 # via -r requirements/test.in markdown-it-py==3.0.0 # via rich -markupsafe==3.0.2 +markupsafe==3.0.1 # via # jinja2 # werkzeug @@ -355,39 +355,42 @@ numpy==1.26.4 # transformers # tritonclient # vocos -nvidia-cublas-cu12==12.4.5.8 +nvidia-cublas-cu12==12.6.4.1 # 
via # nvidia-cudnn-cu12 # nvidia-cusolver-cu12 # torch -nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-cupti-cu12==12.6.80 # via torch -nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.6.77 # via torch -nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.6.77 # via torch -nvidia-cudnn-cu12==9.1.0.70 +nvidia-cudnn-cu12==9.5.1.17 # via torch -nvidia-cufft-cu12==11.2.1.3 +nvidia-cufft-cu12==11.3.0.4 # via torch -nvidia-curand-cu12==10.3.5.147 +nvidia-cufile-cu12==1.11.1.6 # via torch -nvidia-cusolver-cu12==11.6.1.9 +nvidia-curand-cu12==10.3.7.77 # via torch -nvidia-cusparse-cu12==12.3.1.170 +nvidia-cusolver-cu12==11.7.1.2 + # via torch +nvidia-cusparse-cu12==12.5.4.2 # via # nvidia-cusolver-cu12 # torch -nvidia-cusparselt-cu12==0.6.2 +nvidia-cusparselt-cu12==0.6.3 # via torch -nvidia-nccl-cu12==2.21.5 +nvidia-nccl-cu12==2.26.2 # via torch -nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvjitlink-cu12==12.6.85 # via + # nvidia-cufft-cu12 # nvidia-cusolver-cu12 # nvidia-cusparse-cu12 # torch -nvidia-nvtx-cu12==12.4.127 +nvidia-nvtx-cu12==12.6.77 # via torch opencv-python-headless==4.11.0.86 # via @@ -634,6 +637,7 @@ setuptools==75.8.0 # mamba-ssm # pytablewriter # torch + # triton shellingham==1.5.4 # via typer six==1.16.0 @@ -664,7 +668,7 @@ starlette-testclient==0.4.1 # via schemathesis statsmodels==0.14.4 # via genai-perf -sympy==1.13.1 +sympy==1.13.3 # via # einx # torch @@ -696,7 +700,7 @@ tomli==2.2.1 # via schemathesis tomli-w==1.2.0 # via schemathesis -torch==2.6.0 +torch==2.7.0 # via # -r requirements/test.in # accelerate @@ -714,12 +718,12 @@ torch==2.6.0 # torchvision # vector-quantize-pytorch # vocos -torchaudio==2.6.0 +torchaudio==2.7.0 # via # -r requirements/test.in # encodec # vocos -torchvision==0.21.0 +torchvision==0.22.0 # via # -r requirements/test.in # timm @@ -748,7 +752,7 @@ transformers==4.51.3 # transformers-stream-generator transformers-stream-generator==0.0.5 # via -r requirements/test.in -triton==3.2.0 +triton==3.3.0 # via 
torch tritonclient==2.51.0 # via diff --git a/setup.py b/setup.py index a1867960e5930..7675fbdf3efec 100755 --- a/setup.py +++ b/setup.py @@ -54,7 +54,7 @@ elif (sys.platform.startswith("linux") and torch.version.cuda is None # fallback to cpu VLLM_TARGET_DEVICE = "cpu" -MAIN_CUDA_VERSION = "12.4" +MAIN_CUDA_VERSION = "12.8" def is_sccache_available() -> bool: diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py index 6d96f58320c84..1702203b18346 100644 --- a/vllm/attention/ops/ipex_attn.py +++ b/vllm/attention/ops/ipex_attn.py @@ -5,7 +5,8 @@ from typing import Dict, List, Optional, Tuple try: import intel_extension_for_pytorch.llm.modules as ipex_modules _use_ipex = True -except ImportError: +# AttributeError is to handle a bug in ipex https://github.com/intel/intel-extension-for-pytorch/pull/813 +except (ImportError, AttributeError): _use_ipex = False import torch