mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-26 11:57:05 +08:00
Update PyTorch to 2.7.0 (#16859)
This commit is contained in:
parent
1c2bc7ead0
commit
2c4f59afc3
@ -1,20 +1,20 @@
|
|||||||
steps:
|
steps:
|
||||||
- label: "Build wheel - CUDA 12.4"
|
- label: "Build wheel - CUDA 12.8"
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
- label: "Build wheel - CUDA 12.1"
|
- label: "Build wheel - CUDA 12.6"
|
||||||
agents:
|
agents:
|
||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||||
- "mkdir artifacts"
|
- "mkdir artifacts"
|
||||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||||
@ -48,7 +48,7 @@ steps:
|
|||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||||
|
|
||||||
- label: "Build and publish TPU release image"
|
- label: "Build and publish TPU release image"
|
||||||
|
|||||||
@ -50,11 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
|
|||||||
if [[ $normal_wheel == *"cu118"* ]]; then
|
if [[ $normal_wheel == *"cu118"* ]]; then
|
||||||
# if $normal_wheel matches cu118, do not upload the index.html
|
# if $normal_wheel matches cu118, do not upload the index.html
|
||||||
echo "Skipping index files for cu118 wheels"
|
echo "Skipping index files for cu118 wheels"
|
||||||
elif [[ $normal_wheel == *"cu121"* ]]; then
|
elif [[ $normal_wheel == *"cu126"* ]]; then
|
||||||
# if $normal_wheel matches cu121, do not upload the index.html
|
# if $normal_wheel matches cu126, do not upload the index.html
|
||||||
echo "Skipping index files for cu121 wheels"
|
echo "Skipping index files for cu126 wheels"
|
||||||
else
|
else
|
||||||
# only upload index.html for cu124 wheels (default wheels)
|
# only upload index.html for cu128 wheels (default wheels)
|
||||||
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
|
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
|
||||||
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
|
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
|
||||||
fi
|
fi
|
||||||
@ -66,11 +66,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
|
|||||||
if [[ $normal_wheel == *"cu118"* ]]; then
|
if [[ $normal_wheel == *"cu118"* ]]; then
|
||||||
# if $normal_wheel matches cu118, do not upload the index.html
|
# if $normal_wheel matches cu118, do not upload the index.html
|
||||||
echo "Skipping index files for cu118 wheels"
|
echo "Skipping index files for cu118 wheels"
|
||||||
elif [[ $normal_wheel == *"cu121"* ]]; then
|
elif [[ $normal_wheel == *"cu126"* ]]; then
|
||||||
# if $normal_wheel matches cu121, do not upload the index.html
|
# if $normal_wheel matches cu126, do not upload the index.html
|
||||||
echo "Skipping index files for cu121 wheels"
|
echo "Skipping index files for cu126 wheels"
|
||||||
else
|
else
|
||||||
# only upload index.html for cu124 wheels (default wheels)
|
# only upload index.html for cu128 wheels (default wheels)
|
||||||
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
|
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
@ -455,7 +455,7 @@ steps:
|
|||||||
- tests/models/encoder_decoder/language
|
- tests/models/encoder_decoder/language
|
||||||
commands:
|
commands:
|
||||||
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
|
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
|
||||||
- pip install causal-conv1d
|
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
|
||||||
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
|
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
|
||||||
- pytest -v -s models/embedding/language -m core_model
|
- pytest -v -s models/embedding/language -m core_model
|
||||||
|
|
||||||
|
|||||||
@ -46,7 +46,7 @@ repos:
|
|||||||
rev: 0.6.17
|
rev: 0.6.17
|
||||||
hooks:
|
hooks:
|
||||||
- id: pip-compile
|
- id: pip-compile
|
||||||
args: [requirements/test.in, -o, requirements/test.txt]
|
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match]
|
||||||
files: ^requirements/test\.(in|txt)$
|
files: ^requirements/test\.(in|txt)$
|
||||||
- repo: local
|
- repo: local
|
||||||
hooks:
|
hooks:
|
||||||
|
|||||||
@ -46,8 +46,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
|
|||||||
# requirements.txt files and should be kept consistent. The ROCm torch
|
# requirements.txt files and should be kept consistent. The ROCm torch
|
||||||
# versions are derived from docker/Dockerfile.rocm
|
# versions are derived from docker/Dockerfile.rocm
|
||||||
#
|
#
|
||||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
|
set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
|
||||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
|
set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
|
||||||
|
|
||||||
#
|
#
|
||||||
# Try to find python package with an executable that exactly matches
|
# Try to find python package with an executable that exactly matches
|
||||||
|
|||||||
@ -5,11 +5,11 @@
|
|||||||
# docs/source/contributing/dockerfile/dockerfile.md and
|
# docs/source/contributing/dockerfile/dockerfile.md and
|
||||||
# docs/source/assets/contributing/dockerfile-stages-dependency.png
|
# docs/source/assets/contributing/dockerfile-stages-dependency.png
|
||||||
|
|
||||||
ARG CUDA_VERSION=12.4.1
|
ARG CUDA_VERSION=12.8.1
|
||||||
#################### BASE BUILD IMAGE ####################
|
#################### BASE BUILD IMAGE ####################
|
||||||
# prepare basic build environment
|
# prepare basic build environment
|
||||||
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
|
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
|
||||||
ARG CUDA_VERSION=12.4.1
|
ARG CUDA_VERSION=12.8.1
|
||||||
ARG PYTHON_VERSION=3.12
|
ARG PYTHON_VERSION=3.12
|
||||||
ARG TARGETPLATFORM
|
ARG TARGETPLATFORM
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
@ -37,6 +37,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||||
ENV UV_HTTP_TIMEOUT=500
|
ENV UV_HTTP_TIMEOUT=500
|
||||||
|
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||||
|
|
||||||
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
|
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
|
||||||
# as it was causing spam when compiling the CUTLASS kernels
|
# as it was causing spam when compiling the CUTLASS kernels
|
||||||
@ -69,7 +70,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
COPY requirements/common.txt requirements/common.txt
|
COPY requirements/common.txt requirements/common.txt
|
||||||
COPY requirements/cuda.txt requirements/cuda.txt
|
COPY requirements/cuda.txt requirements/cuda.txt
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install --system -r requirements/cuda.txt
|
uv pip install --system -r requirements/cuda.txt \
|
||||||
|
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||||
|
|
||||||
# cuda arch list used by torch
|
# cuda arch list used by torch
|
||||||
# can be useful for both `dev` and `test`
|
# can be useful for both `dev` and `test`
|
||||||
@ -92,9 +94,11 @@ COPY requirements/build.txt requirements/build.txt
|
|||||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||||
ENV UV_HTTP_TIMEOUT=500
|
ENV UV_HTTP_TIMEOUT=500
|
||||||
|
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install --system -r requirements/build.txt
|
uv pip install --system -r requirements/build.txt \
|
||||||
|
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
ARG GIT_REPO_CHECK=0
|
ARG GIT_REPO_CHECK=0
|
||||||
@ -161,22 +165,25 @@ FROM base as dev
|
|||||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||||
ENV UV_HTTP_TIMEOUT=500
|
ENV UV_HTTP_TIMEOUT=500
|
||||||
|
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||||
|
|
||||||
|
# Workaround for #17068
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
|
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
|
||||||
|
|
||||||
COPY requirements/lint.txt requirements/lint.txt
|
COPY requirements/lint.txt requirements/lint.txt
|
||||||
COPY requirements/test.txt requirements/test.txt
|
COPY requirements/test.txt requirements/test.txt
|
||||||
COPY requirements/dev.txt requirements/dev.txt
|
COPY requirements/dev.txt requirements/dev.txt
|
||||||
# Workaround for #17068
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install --system mamba-ssm==2.2.4 --no-build-isolation
|
uv pip install --system -r requirements/dev.txt \
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||||
uv pip install --system -r requirements/dev.txt
|
|
||||||
#################### DEV IMAGE ####################
|
#################### DEV IMAGE ####################
|
||||||
|
|
||||||
#################### vLLM installation IMAGE ####################
|
#################### vLLM installation IMAGE ####################
|
||||||
# image with vLLM installed
|
# image with vLLM installed
|
||||||
# TODO: Restore to base image after FlashInfer AOT wheel fixed
|
# TODO: Restore to base image after FlashInfer AOT wheel fixed
|
||||||
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
|
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
|
||||||
ARG CUDA_VERSION=12.4.1
|
ARG CUDA_VERSION=12.8.1
|
||||||
ARG PYTHON_VERSION=3.12
|
ARG PYTHON_VERSION=3.12
|
||||||
WORKDIR /vllm-workspace
|
WORKDIR /vllm-workspace
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
@ -209,6 +216,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||||
ENV UV_HTTP_TIMEOUT=500
|
ENV UV_HTTP_TIMEOUT=500
|
||||||
|
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||||
|
|
||||||
# Workaround for https://github.com/openai/triton/issues/2507 and
|
# Workaround for https://github.com/openai/triton/issues/2507 and
|
||||||
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
|
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
|
||||||
@ -229,7 +237,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
# Install vllm wheel first, so that torch etc will be installed.
|
# Install vllm wheel first, so that torch etc will be installed.
|
||||||
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
|
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
|
||||||
--mount=type=cache,target=/root/.cache/uv \
|
--mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install --system dist/*.whl --verbose
|
uv pip install --system dist/*.whl --verbose \
|
||||||
|
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||||
|
|
||||||
# If we need to build FlashInfer wheel before its release:
|
# If we need to build FlashInfer wheel before its release:
|
||||||
# $ export FLASHINFER_ENABLE_AOT=1
|
# $ export FLASHINFER_ENABLE_AOT=1
|
||||||
@ -246,19 +255,26 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
|
|||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
. /etc/environment && \
|
. /etc/environment && \
|
||||||
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
|
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
|
||||||
uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
|
# TESTING: install FlashInfer from source to test 2.7.0 final RC
|
||||||
|
FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX' \
|
||||||
|
uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.2.post1" ; \
|
||||||
fi
|
fi
|
||||||
COPY examples examples
|
COPY examples examples
|
||||||
COPY benchmarks benchmarks
|
COPY benchmarks benchmarks
|
||||||
COPY ./vllm/collect_env.py .
|
COPY ./vllm/collect_env.py .
|
||||||
|
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
|
. /etc/environment && \
|
||||||
|
uv pip list
|
||||||
|
|
||||||
# Although we build Flashinfer with AOT mode, there are still
|
# Although we build Flashinfer with AOT mode, there are still
|
||||||
# some issues w.r.t. JIT compilation. Therefore we need to
|
# some issues w.r.t. JIT compilation. Therefore we need to
|
||||||
# install build dependencies for JIT compilation.
|
# install build dependencies for JIT compilation.
|
||||||
# TODO: Remove this once FlashInfer AOT wheel is fixed
|
# TODO: Remove this once FlashInfer AOT wheel is fixed
|
||||||
COPY requirements/build.txt requirements/build.txt
|
COPY requirements/build.txt requirements/build.txt
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install --system -r requirements/build.txt
|
uv pip install --system -r requirements/build.txt \
|
||||||
|
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||||
|
|
||||||
#################### vLLM installation IMAGE ####################
|
#################### vLLM installation IMAGE ####################
|
||||||
|
|
||||||
@ -272,11 +288,13 @@ ADD . /vllm-workspace/
|
|||||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
|
||||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
# Reference: https://github.com/astral-sh/uv/pull/1694
|
||||||
ENV UV_HTTP_TIMEOUT=500
|
ENV UV_HTTP_TIMEOUT=500
|
||||||
|
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||||
|
|
||||||
# install development dependencies (for testing)
|
|
||||||
# Workaround for #17068
|
# Workaround for #17068
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install --system mamba-ssm==2.2.4 --no-build-isolation
|
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
|
||||||
|
|
||||||
|
# install development dependencies (for testing)
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install --system -r requirements/dev.txt
|
uv pip install --system -r requirements/dev.txt
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
# Installation
|
# Installation
|
||||||
|
|
||||||
vLLM contains pre-compiled C++ and CUDA (12.1) binaries.
|
vLLM contains pre-compiled C++ and CUDA (12.8) binaries.
|
||||||
|
|
||||||
## Requirements
|
## Requirements
|
||||||
|
|
||||||
@ -23,12 +23,12 @@ Therefore, it is recommended to install vLLM with a **fresh new** environment. I
|
|||||||
You can install vLLM using either `pip` or `uv pip`:
|
You can install vLLM using either `pip` or `uv pip`:
|
||||||
|
|
||||||
```console
|
```console
|
||||||
# Install vLLM with CUDA 12.4.
|
# Install vLLM with CUDA 12.8.
|
||||||
pip install vllm # If you are using pip.
|
pip install vllm # If you are using pip.
|
||||||
uv pip install vllm # If you are using uv.
|
uv pip install vllm # If you are using uv.
|
||||||
```
|
```
|
||||||
|
|
||||||
As of now, vLLM's binaries are compiled with CUDA 12.4 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.1, 11.8, and public PyTorch release versions:
|
As of now, vLLM's binaries are compiled with CUDA 12.8 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.6, 11.8, and public PyTorch release versions:
|
||||||
|
|
||||||
```console
|
```console
|
||||||
# Install vLLM with CUDA 11.8.
|
# Install vLLM with CUDA 11.8.
|
||||||
|
|||||||
@ -8,7 +8,7 @@ image:
|
|||||||
# -- Image tag
|
# -- Image tag
|
||||||
tag: "latest"
|
tag: "latest"
|
||||||
# -- Container launch command
|
# -- Container launch command
|
||||||
command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "bfloat16", "--host", "0.0.0.0", "--port", "8000"]
|
command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--dtype", "float32", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"]
|
||||||
|
|
||||||
# -- Container port
|
# -- Container port
|
||||||
containerPort: 8000
|
containerPort: 8000
|
||||||
|
|||||||
@ -6,7 +6,7 @@ requires = [
|
|||||||
"packaging",
|
"packaging",
|
||||||
"setuptools>=61",
|
"setuptools>=61",
|
||||||
"setuptools-scm>=8.0",
|
"setuptools-scm>=8.0",
|
||||||
"torch == 2.6.0",
|
"torch == 2.7.0",
|
||||||
"wheel",
|
"wheel",
|
||||||
"jinja2",
|
"jinja2",
|
||||||
]
|
]
|
||||||
|
|||||||
@ -4,6 +4,6 @@ ninja
|
|||||||
packaging
|
packaging
|
||||||
setuptools>=61
|
setuptools>=61
|
||||||
setuptools-scm>=8
|
setuptools-scm>=8
|
||||||
torch==2.6.0
|
torch==2.7.0
|
||||||
wheel
|
wheel
|
||||||
jinja2>=3.1.6
|
jinja2>=3.1.6
|
||||||
|
|||||||
@ -2,18 +2,19 @@
|
|||||||
-r common.txt
|
-r common.txt
|
||||||
|
|
||||||
# Dependencies for CPUs
|
# Dependencies for CPUs
|
||||||
torch==2.6.0+cpu; platform_machine == "x86_64"
|
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||||
torch==2.6.0; platform_system == "Darwin"
|
torch==2.7.0+cpu; platform_machine == "x86_64"
|
||||||
torch==2.6.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
|
torch==2.7.0; platform_system == "Darwin"
|
||||||
|
torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
|
||||||
torch==2.7.0.dev20250304; platform_machine == "s390x"
|
torch==2.7.0.dev20250304; platform_machine == "s390x"
|
||||||
|
|
||||||
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
|
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
|
||||||
torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
|
torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
|
||||||
torchaudio==2.6.0; platform_machine == "ppc64le"
|
torchaudio==2.7.0; platform_machine == "ppc64le"
|
||||||
|
|
||||||
# required for the image processor of phi3v, this must be updated alongside torch
|
# required for the image processor of phi3v, this must be updated alongside torch
|
||||||
torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
|
torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
|
||||||
torchvision==0.21.0; platform_machine == "ppc64le"
|
torchvision==0.22.0; platform_machine == "ppc64le"
|
||||||
datasets # for benchmark scripts
|
datasets # for benchmark scripts
|
||||||
|
|
||||||
# cpu cannot use triton 3.3.0
|
# cpu cannot use triton 3.3.0
|
||||||
|
|||||||
@ -6,8 +6,9 @@ numba == 0.61.2; python_version > '3.9'
|
|||||||
|
|
||||||
# Dependencies for NVIDIA GPUs
|
# Dependencies for NVIDIA GPUs
|
||||||
ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1.
|
ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1.
|
||||||
torch==2.6.0
|
torch==2.7.0
|
||||||
torchaudio==2.6.0
|
torchaudio==2.7.0
|
||||||
# These must be updated alongside torch
|
# These must be updated alongside torch
|
||||||
torchvision==0.21.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
|
torchvision==0.22.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
|
||||||
xformers==0.0.29.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.6.0
|
# https://github.com/facebookresearch/xformers/releases/tag/v0.0.30
|
||||||
|
xformers==0.0.30; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7
|
||||||
|
|||||||
@ -2,9 +2,9 @@
|
|||||||
-r common.txt
|
-r common.txt
|
||||||
|
|
||||||
--extra-index-url https://download.pytorch.org/whl/rocm6.2.4
|
--extra-index-url https://download.pytorch.org/whl/rocm6.2.4
|
||||||
torch==2.6.0
|
torch==2.7.0
|
||||||
torchvision==0.21.0
|
torchvision==0.22.0
|
||||||
torchaudio==2.6.0
|
torchaudio==2.7.0
|
||||||
|
|
||||||
triton==3.2
|
triton==3.2
|
||||||
cmake>=3.26,<4
|
cmake>=3.26,<4
|
||||||
|
|||||||
@ -23,9 +23,9 @@ sentence-transformers # required for embedding tests
|
|||||||
soundfile # required for audio tests
|
soundfile # required for audio tests
|
||||||
jiwer # required for audio tests
|
jiwer # required for audio tests
|
||||||
timm # required for internvl test
|
timm # required for internvl test
|
||||||
torch==2.6.0
|
torch==2.7.0
|
||||||
torchaudio==2.6.0
|
torchaudio==2.7.0
|
||||||
torchvision==0.21.0
|
torchvision==0.22.0
|
||||||
transformers_stream_generator # required for qwen-vl test
|
transformers_stream_generator # required for qwen-vl test
|
||||||
mamba_ssm # required for plamo2 test
|
mamba_ssm # required for plamo2 test
|
||||||
matplotlib # required for qwen-vl test
|
matplotlib # required for qwen-vl test
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
# This file was autogenerated by uv via the following command:
|
# This file was autogenerated by uv via the following command:
|
||||||
# uv pip compile requirements/test.in -o requirements/test.txt
|
# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match
|
||||||
absl-py==2.1.0
|
absl-py==2.1.0
|
||||||
# via rouge-score
|
# via rouge-score
|
||||||
accelerate==1.0.1
|
accelerate==1.0.1
|
||||||
@ -274,7 +274,7 @@ mamba-ssm==2.2.4
|
|||||||
# via -r requirements/test.in
|
# via -r requirements/test.in
|
||||||
markdown-it-py==3.0.0
|
markdown-it-py==3.0.0
|
||||||
# via rich
|
# via rich
|
||||||
markupsafe==3.0.2
|
markupsafe==3.0.1
|
||||||
# via
|
# via
|
||||||
# jinja2
|
# jinja2
|
||||||
# werkzeug
|
# werkzeug
|
||||||
@ -355,39 +355,42 @@ numpy==1.26.4
|
|||||||
# transformers
|
# transformers
|
||||||
# tritonclient
|
# tritonclient
|
||||||
# vocos
|
# vocos
|
||||||
nvidia-cublas-cu12==12.4.5.8
|
nvidia-cublas-cu12==12.6.4.1
|
||||||
# via
|
# via
|
||||||
# nvidia-cudnn-cu12
|
# nvidia-cudnn-cu12
|
||||||
# nvidia-cusolver-cu12
|
# nvidia-cusolver-cu12
|
||||||
# torch
|
# torch
|
||||||
nvidia-cuda-cupti-cu12==12.4.127
|
nvidia-cuda-cupti-cu12==12.6.80
|
||||||
# via torch
|
# via torch
|
||||||
nvidia-cuda-nvrtc-cu12==12.4.127
|
nvidia-cuda-nvrtc-cu12==12.6.77
|
||||||
# via torch
|
# via torch
|
||||||
nvidia-cuda-runtime-cu12==12.4.127
|
nvidia-cuda-runtime-cu12==12.6.77
|
||||||
# via torch
|
# via torch
|
||||||
nvidia-cudnn-cu12==9.1.0.70
|
nvidia-cudnn-cu12==9.5.1.17
|
||||||
# via torch
|
# via torch
|
||||||
nvidia-cufft-cu12==11.2.1.3
|
nvidia-cufft-cu12==11.3.0.4
|
||||||
# via torch
|
# via torch
|
||||||
nvidia-curand-cu12==10.3.5.147
|
nvidia-cufile-cu12==1.11.1.6
|
||||||
# via torch
|
# via torch
|
||||||
nvidia-cusolver-cu12==11.6.1.9
|
nvidia-curand-cu12==10.3.7.77
|
||||||
# via torch
|
# via torch
|
||||||
nvidia-cusparse-cu12==12.3.1.170
|
nvidia-cusolver-cu12==11.7.1.2
|
||||||
|
# via torch
|
||||||
|
nvidia-cusparse-cu12==12.5.4.2
|
||||||
# via
|
# via
|
||||||
# nvidia-cusolver-cu12
|
# nvidia-cusolver-cu12
|
||||||
# torch
|
# torch
|
||||||
nvidia-cusparselt-cu12==0.6.2
|
nvidia-cusparselt-cu12==0.6.3
|
||||||
# via torch
|
# via torch
|
||||||
nvidia-nccl-cu12==2.21.5
|
nvidia-nccl-cu12==2.26.2
|
||||||
# via torch
|
# via torch
|
||||||
nvidia-nvjitlink-cu12==12.4.127
|
nvidia-nvjitlink-cu12==12.6.85
|
||||||
# via
|
# via
|
||||||
|
# nvidia-cufft-cu12
|
||||||
# nvidia-cusolver-cu12
|
# nvidia-cusolver-cu12
|
||||||
# nvidia-cusparse-cu12
|
# nvidia-cusparse-cu12
|
||||||
# torch
|
# torch
|
||||||
nvidia-nvtx-cu12==12.4.127
|
nvidia-nvtx-cu12==12.6.77
|
||||||
# via torch
|
# via torch
|
||||||
opencv-python-headless==4.11.0.86
|
opencv-python-headless==4.11.0.86
|
||||||
# via
|
# via
|
||||||
@ -634,6 +637,7 @@ setuptools==75.8.0
|
|||||||
# mamba-ssm
|
# mamba-ssm
|
||||||
# pytablewriter
|
# pytablewriter
|
||||||
# torch
|
# torch
|
||||||
|
# triton
|
||||||
shellingham==1.5.4
|
shellingham==1.5.4
|
||||||
# via typer
|
# via typer
|
||||||
six==1.16.0
|
six==1.16.0
|
||||||
@ -664,7 +668,7 @@ starlette-testclient==0.4.1
|
|||||||
# via schemathesis
|
# via schemathesis
|
||||||
statsmodels==0.14.4
|
statsmodels==0.14.4
|
||||||
# via genai-perf
|
# via genai-perf
|
||||||
sympy==1.13.1
|
sympy==1.13.3
|
||||||
# via
|
# via
|
||||||
# einx
|
# einx
|
||||||
# torch
|
# torch
|
||||||
@ -696,7 +700,7 @@ tomli==2.2.1
|
|||||||
# via schemathesis
|
# via schemathesis
|
||||||
tomli-w==1.2.0
|
tomli-w==1.2.0
|
||||||
# via schemathesis
|
# via schemathesis
|
||||||
torch==2.6.0
|
torch==2.7.0
|
||||||
# via
|
# via
|
||||||
# -r requirements/test.in
|
# -r requirements/test.in
|
||||||
# accelerate
|
# accelerate
|
||||||
@ -714,12 +718,12 @@ torch==2.6.0
|
|||||||
# torchvision
|
# torchvision
|
||||||
# vector-quantize-pytorch
|
# vector-quantize-pytorch
|
||||||
# vocos
|
# vocos
|
||||||
torchaudio==2.6.0
|
torchaudio==2.7.0
|
||||||
# via
|
# via
|
||||||
# -r requirements/test.in
|
# -r requirements/test.in
|
||||||
# encodec
|
# encodec
|
||||||
# vocos
|
# vocos
|
||||||
torchvision==0.21.0
|
torchvision==0.22.0
|
||||||
# via
|
# via
|
||||||
# -r requirements/test.in
|
# -r requirements/test.in
|
||||||
# timm
|
# timm
|
||||||
@ -748,7 +752,7 @@ transformers==4.51.3
|
|||||||
# transformers-stream-generator
|
# transformers-stream-generator
|
||||||
transformers-stream-generator==0.0.5
|
transformers-stream-generator==0.0.5
|
||||||
# via -r requirements/test.in
|
# via -r requirements/test.in
|
||||||
triton==3.2.0
|
triton==3.3.0
|
||||||
# via torch
|
# via torch
|
||||||
tritonclient==2.51.0
|
tritonclient==2.51.0
|
||||||
# via
|
# via
|
||||||
|
|||||||
2
setup.py
2
setup.py
@ -54,7 +54,7 @@ elif (sys.platform.startswith("linux") and torch.version.cuda is None
|
|||||||
# fallback to cpu
|
# fallback to cpu
|
||||||
VLLM_TARGET_DEVICE = "cpu"
|
VLLM_TARGET_DEVICE = "cpu"
|
||||||
|
|
||||||
MAIN_CUDA_VERSION = "12.4"
|
MAIN_CUDA_VERSION = "12.8"
|
||||||
|
|
||||||
|
|
||||||
def is_sccache_available() -> bool:
|
def is_sccache_available() -> bool:
|
||||||
|
|||||||
@ -5,7 +5,8 @@ from typing import Dict, List, Optional, Tuple
|
|||||||
try:
|
try:
|
||||||
import intel_extension_for_pytorch.llm.modules as ipex_modules
|
import intel_extension_for_pytorch.llm.modules as ipex_modules
|
||||||
_use_ipex = True
|
_use_ipex = True
|
||||||
except ImportError:
|
# AttributeError is to handle a bug in ipex https://github.com/intel/intel-extension-for-pytorch/pull/813
|
||||||
|
except (ImportError, AttributeError):
|
||||||
_use_ipex = False
|
_use_ipex = False
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user