mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-21 05:07:02 +08:00
Merge branch 'main' into rob-fixes
This commit is contained in:
commit
144162fc8c
@ -361,7 +361,7 @@ main() {
|
||||
# get the current IP address, required by benchmark_serving.py
|
||||
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
|
||||
# turn of the reporting of the status of each request, to clean up the terminal output
|
||||
export VLLM_LOG_LEVEL="WARNING"
|
||||
export VLLM_LOGGING_LEVEL="WARNING"
|
||||
|
||||
# prepare for benchmarking
|
||||
cd benchmarks || exit 1
|
||||
|
||||
@ -1,16 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the OpenVINO docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
|
||||
# Try building the docker image
|
||||
docker build -t openvino-test -f Dockerfile.openvino .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() { docker rm -f openvino-test || true; }
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image and launch offline inference
|
||||
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
@ -19,17 +19,19 @@ docker run --privileged --net host --shm-size=16G -it \
|
||||
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install pytest \
|
||||
&& python3 -m pip install lm_eval[api]==0.4.4 \
|
||||
&& export VLLM_USE_V1=1 \
|
||||
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
|
||||
&& echo TEST_1 \
|
||||
&& VLLM_USE_V1=1 python3 /workspace/vllm/tests/tpu/test_compilation.py \
|
||||
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
|
||||
&& echo TEST_2 \
|
||||
&& VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
|
||||
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
|
||||
&& echo TEST_3 \
|
||||
&& VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
|
||||
&& pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
|
||||
&& echo TEST_4 \
|
||||
&& VLLM_USE_V1=1 pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
|
||||
&& pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
|
||||
&& echo TEST_5 \
|
||||
&& VLLM_USE_V1=1 python3 /workspace/vllm/examples/offline_inference/tpu.py" \
|
||||
|
||||
&& python3 /workspace/vllm/examples/offline_inference/tpu.py" \
|
||||
|
||||
|
||||
# TODO: This test fails because it uses RANDOM_SEED sampling
|
||||
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||
|
||||
@ -1,29 +0,0 @@
|
||||
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
|
||||
# to run the OpenAI compatible server.
|
||||
|
||||
FROM ubuntu:22.04 AS dev
|
||||
|
||||
RUN apt-get update -y && \
|
||||
apt-get install -y \
|
||||
git python3-pip \
|
||||
ffmpeg libsm6 libxext6 libgl1
|
||||
WORKDIR /workspace
|
||||
|
||||
COPY . .
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
||||
|
||||
RUN python3 -m pip install -U pip
|
||||
# install build requirements
|
||||
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements/build.txt
|
||||
# build vLLM with OpenVINO backend
|
||||
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace
|
||||
|
||||
COPY examples/ /workspace/examples
|
||||
COPY benchmarks/ /workspace/benchmarks
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@ -12,6 +12,8 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
|
||||
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
|
||||
ARG FA_BRANCH="b7d29fb"
|
||||
ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
|
||||
ARG AITER_BRANCH="21d47a9"
|
||||
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
|
||||
|
||||
FROM ${BASE_IMAGE} AS base
|
||||
|
||||
@ -129,8 +131,18 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
|
||||
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
|
||||
pip install /install/*.whl
|
||||
|
||||
ARG AITER_REPO
|
||||
ARG AITER_BRANCH
|
||||
RUN git clone --recursive ${AITER_REPO}
|
||||
RUN cd aiter \
|
||||
&& git checkout ${AITER_BRANCH} \
|
||||
&& git submodule update --init --recursive \
|
||||
&& pip install -r requirements.txt \
|
||||
&& PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter
|
||||
|
||||
ARG BASE_IMAGE
|
||||
ARG HIPBLASLT_BRANCH
|
||||
ARG HIPBLAS_COMMON_BRANCH
|
||||
ARG LEGACY_HIPBLASLT_OPTION
|
||||
ARG RCCL_BRANCH
|
||||
ARG RCCL_REPO
|
||||
@ -155,4 +167,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
||||
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
|
||||
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
|
||||
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt
|
||||
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
|
||||
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
|
||||
|
||||
14
README.md
14
README.md
@ -28,19 +28,7 @@ Easy, fast, and cheap LLM serving for everyone
|
||||
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
|
||||
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
|
||||
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
|
||||
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
|
||||
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
|
||||
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
|
||||
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
|
||||
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
|
||||
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
|
||||
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
|
||||
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
|
||||
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
|
||||
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
|
||||
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
|
||||
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
|
||||
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
|
||||
- [2024/12] vLLM joins [PyTorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
|
||||
|
||||
---
|
||||
|
||||
|
||||
@ -24,7 +24,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
|
||||
// sum of squares
|
||||
float ss = 0.0f;
|
||||
|
||||
for (int32_t i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
float x = static_cast<float>(input[token_offset + i]);
|
||||
if constexpr (has_residual) {
|
||||
x += static_cast<float>(residual[token_offset + i]);
|
||||
@ -58,7 +58,7 @@ __device__ void compute_dynamic_per_token_scales(
|
||||
constexpr scalar_out_t qmax{std::numeric_limits<scalar_out_t>::max()};
|
||||
|
||||
float block_absmax_val_maybe = 0.0f;
|
||||
for (int32_t i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
float x = static_cast<float>(input[token_offset + i]);
|
||||
if constexpr (has_residual) {
|
||||
x += static_cast<float>(residual[token_offset + i]);
|
||||
@ -103,7 +103,7 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
|
||||
int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
|
||||
;
|
||||
|
||||
for (int32_t i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
|
||||
float x = static_cast<float>(input[token_offset + i]);
|
||||
if constexpr (has_residual) {
|
||||
x += static_cast<float>(residual[token_offset + i]);
|
||||
@ -142,7 +142,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input,
|
||||
int32_t const num_vec_elems = hidden_size >> 2;
|
||||
|
||||
#pragma unroll 4
|
||||
for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
|
||||
for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
|
||||
vec4_t<scalar_t> in = vec_input[i];
|
||||
|
||||
vec4_t<float> x;
|
||||
@ -206,7 +206,7 @@ __device__ void compute_dynamic_per_token_scales(
|
||||
float block_absmax_val_maybe = 0.0f;
|
||||
|
||||
#pragma unroll 4
|
||||
for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
|
||||
for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
|
||||
vec4_t<scalar_t> in = vec_input[i];
|
||||
vec4_t<scalar_t> const w = vec_weight[i];
|
||||
|
||||
@ -286,7 +286,7 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output,
|
||||
// TODO(luka/varun) extract into type-agnostic vectorized quant function to
|
||||
// replace scaled_fp8_conversion_vec
|
||||
#pragma unroll 4
|
||||
for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
|
||||
for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) {
|
||||
vec4_t<scalar_t> const in = vec_input[i];
|
||||
vec4_t<scalar_t> const w = vec_weight[i];
|
||||
|
||||
|
||||
@ -101,10 +101,10 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const auto i = blockIdx.x;
|
||||
const block_q2_K * x = (const block_q2_K *) vx;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const auto tid = threadIdx.x;
|
||||
const int n = tid/32;
|
||||
const int l = tid - 32*n;
|
||||
const int is = 8*n + l/16;
|
||||
@ -123,10 +123,10 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const auto i = blockIdx.x;
|
||||
const block_q3_K * x = (const block_q3_K *) vx;
|
||||
|
||||
const int r = threadIdx.x/4;
|
||||
const auto r = threadIdx.x/4;
|
||||
const int tid = r/2;
|
||||
const int is0 = r%2;
|
||||
const int l0 = 16*is0 + 4*(threadIdx.x%4);
|
||||
@ -164,10 +164,10 @@ template<typename dst_t>
|
||||
static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
const block_q4_K * x = (const block_q4_K *) vx;
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const auto i = blockIdx.x;
|
||||
|
||||
// assume 32 threads
|
||||
const int tid = threadIdx.x;
|
||||
const auto tid = threadIdx.x;
|
||||
const int il = tid/8;
|
||||
const int ir = tid%8;
|
||||
const int is = 2*il;
|
||||
@ -197,10 +197,10 @@ template<typename dst_t>
|
||||
static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
const block_q5_K * x = (const block_q5_K *) vx;
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const auto i = blockIdx.x;
|
||||
|
||||
// assume 64 threads - this is very slightly better than the one below
|
||||
const int tid = threadIdx.x;
|
||||
const auto tid = threadIdx.x;
|
||||
const int il = tid/16; // il is in 0...3
|
||||
const int ir = tid%16; // ir is in 0...15
|
||||
const int is = 2*il; // is is in 0...6
|
||||
@ -231,10 +231,10 @@ template<typename dst_t>
|
||||
static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
const block_q6_K * x = (const block_q6_K *) vx;
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const auto i = blockIdx.x;
|
||||
|
||||
// assume 64 threads - this is very slightly better than the one below
|
||||
const int tid = threadIdx.x;
|
||||
const auto tid = threadIdx.x;
|
||||
const int ip = tid/32; // ip is 0 or 1
|
||||
const int il = tid - 32*ip; // 0...32
|
||||
const int is = 8*ip + il/16;
|
||||
@ -256,10 +256,10 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const auto i = blockIdx.x;
|
||||
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const auto tid = threadIdx.x;
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@ -275,10 +275,10 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const auto i = blockIdx.x;
|
||||
const block_iq2_xs * x = (const block_iq2_xs *) vx;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const auto tid = threadIdx.x;
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@ -293,10 +293,10 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const auto i = blockIdx.x;
|
||||
const block_iq2_s * x = (const block_iq2_s *) vx;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const auto tid = threadIdx.x;
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@ -309,10 +309,10 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const auto i = blockIdx.x;
|
||||
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const auto tid = threadIdx.x;
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@ -332,10 +332,10 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const auto i = blockIdx.x;
|
||||
const block_iq3_s * x = (const block_iq3_s *) vx;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const auto tid = threadIdx.x;
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@ -399,10 +399,10 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const auto i = blockIdx.x;
|
||||
const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const auto tid = threadIdx.x;
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
||||
@ -417,10 +417,10 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
const int i = blockIdx.x;
|
||||
const auto i = blockIdx.x;
|
||||
const block_iq4_xs * x = (const block_iq4_xs *)vx;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const auto tid = threadIdx.x;
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
|
||||
@ -565,4 +565,4 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(int64_t type) {
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -19,11 +19,11 @@ template <typename scalar_t>
|
||||
static __global__ void quantize_q8_1(const scalar_t* __restrict__ x,
|
||||
void* __restrict__ vy, const int kx,
|
||||
const int kx_padded) {
|
||||
const int ix = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const auto ix = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (ix >= kx_padded) {
|
||||
return;
|
||||
}
|
||||
const int iy = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
const auto iy = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
const int i_padded = iy * kx_padded + ix;
|
||||
|
||||
block_q8_1* y = (block_q8_1*)vy;
|
||||
|
||||
@ -14,10 +14,10 @@ static __device__ __forceinline__ void mul_mat_q(
|
||||
|
||||
const int & ncols_dst = ncols_y;
|
||||
|
||||
const int row_dst_0 = blockIdx.x*mmq_y;
|
||||
const auto row_dst_0 = blockIdx.x*mmq_y;
|
||||
const int & row_x_0 = row_dst_0;
|
||||
|
||||
const int col_dst_0 = blockIdx.y*mmq_x;
|
||||
const auto col_dst_0 = blockIdx.y*mmq_x;
|
||||
const int & col_y_0 = col_dst_0;
|
||||
|
||||
int * tile_x_ql = nullptr;
|
||||
@ -39,7 +39,7 @@ static __device__ __forceinline__ void mul_mat_q(
|
||||
|
||||
#pragma unroll
|
||||
for (int ir = 0; ir < qr && ib0 + ir * blocks_per_warp/qr < blocks_per_row_x; ++ir) {
|
||||
const int kqs = ir*WARP_SIZE_GGUF + threadIdx.x;
|
||||
const auto kqs = ir*WARP_SIZE_GGUF + threadIdx.x;
|
||||
const int kbxd = kqs / QI8_1;
|
||||
|
||||
#pragma unroll
|
||||
@ -53,7 +53,7 @@ static __device__ __forceinline__ void mul_mat_q(
|
||||
#pragma unroll
|
||||
for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
|
||||
const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE_GGUF/QI8_1)) % mmq_x;
|
||||
const int kby = threadIdx.x % (WARP_SIZE_GGUF/QI8_1);
|
||||
const auto kby = threadIdx.x % (WARP_SIZE_GGUF/QI8_1);
|
||||
const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
|
||||
|
||||
// if the sum is not needed it's faster to transform the scale to f32 ahead of time
|
||||
@ -87,14 +87,14 @@ static __device__ __forceinline__ void mul_mat_q(
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < mmq_x; j += nwarps) {
|
||||
const int col_dst = col_dst_0 + j + threadIdx.y;
|
||||
const auto col_dst = col_dst_0 + j + threadIdx.y;
|
||||
if (col_dst >= ncols_dst) {
|
||||
return;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
|
||||
const int row_dst = row_dst_0 + threadIdx.x + i;
|
||||
const auto row_dst = row_dst_0 + threadIdx.x + i;
|
||||
if (row_dst >= nrows_dst) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
// copied and adapted from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmvq.cu
|
||||
template <typename scalar_t, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
|
||||
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, const int ncols, const int nrows) {
|
||||
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
const auto row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||
|
||||
if (row >= nrows) {
|
||||
return;
|
||||
@ -16,7 +16,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
|
||||
const block_q_t * x = (const block_q_t *) vx;
|
||||
const block_q8_1 * y = (const block_q8_1 *) vy;
|
||||
|
||||
for (int i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
|
||||
for (auto i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
|
||||
const int ibx = row*blocks_per_row + i; // x block index
|
||||
|
||||
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
|
||||
|
||||
@ -19,10 +19,10 @@ static __device__ __forceinline__ void moe_q(
|
||||
|
||||
const int ncols_dst = ncols_y * top_k;
|
||||
|
||||
const int row_dst_0 = blockIdx.x * mmq_y;
|
||||
const auto row_dst_0 = blockIdx.x * mmq_y;
|
||||
const int& row_x_0 = row_dst_0;
|
||||
|
||||
const int col_dst_0 = blockIdx.y * mmq_x;
|
||||
const auto col_dst_0 = blockIdx.y * mmq_x;
|
||||
|
||||
int token_offs[mmq_x / nwarps];
|
||||
for (int i = 0; i < mmq_x; i += nwarps) {
|
||||
@ -56,7 +56,7 @@ static __device__ __forceinline__ void moe_q(
|
||||
const int n_per_r = ((qk * blocks_per_warp) / qr);
|
||||
#pragma unroll
|
||||
for (int ir = 0; ir < qr && ib0 * qk + ir * n_per_r < ncols_x; ++ir) {
|
||||
const int kqs = ir * WARP_SIZE_GGUF + threadIdx.x;
|
||||
const auto kqs = ir * WARP_SIZE_GGUF + threadIdx.x;
|
||||
const int kbxd = kqs / QI8_1;
|
||||
|
||||
#pragma unroll
|
||||
@ -73,7 +73,7 @@ static __device__ __forceinline__ void moe_q(
|
||||
}
|
||||
|
||||
if (threadIdx.x < n_per_r / QK8_1) {
|
||||
const int kby = threadIdx.x % (WARP_SIZE_GGUF / QI8_1);
|
||||
const auto kby = threadIdx.x % (WARP_SIZE_GGUF / QI8_1);
|
||||
const int col_y_eff = token_offs[threadIdx.y] / top_k;
|
||||
const int block_x =
|
||||
ib0 * (qk / QK8_1) + ir * (WARP_SIZE_GGUF / QI8_1) + kby;
|
||||
@ -119,7 +119,7 @@ static __device__ __forceinline__ void moe_q(
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
|
||||
const int row_dst = row_dst_0 + threadIdx.x + i;
|
||||
const auto row_dst = row_dst_0 + threadIdx.x + i;
|
||||
if (row_dst >= nrows_dst) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -199,12 +199,12 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
|
||||
MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
int t = threadIdx.x;
|
||||
auto t = threadIdx.x;
|
||||
|
||||
// Block
|
||||
int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
|
||||
int offset_m = blockIdx.y * m_count;
|
||||
int offset_k = blockIdx.z * BLOCK_KN_SIZE;
|
||||
auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
|
||||
auto offset_m = blockIdx.y * m_count;
|
||||
auto offset_k = blockIdx.z * BLOCK_KN_SIZE;
|
||||
|
||||
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
|
||||
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
|
||||
@ -337,12 +337,12 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
|
||||
MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
int t = threadIdx.x;
|
||||
auto t = threadIdx.x;
|
||||
|
||||
// Block
|
||||
int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
|
||||
int offset_m = blockIdx.y * m_count;
|
||||
int offset_k = blockIdx.z * BLOCK_KN_SIZE;
|
||||
auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
|
||||
auto offset_m = blockIdx.y * m_count;
|
||||
auto offset_k = blockIdx.z * BLOCK_KN_SIZE;
|
||||
|
||||
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
|
||||
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
|
||||
@ -458,12 +458,12 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
|
||||
MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
int t = threadIdx.x;
|
||||
auto t = threadIdx.x;
|
||||
|
||||
// Block
|
||||
int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
|
||||
int offset_m = blockIdx.y * m_count;
|
||||
int offset_k = blockIdx.z * BLOCK_KN_SIZE;
|
||||
auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
|
||||
auto offset_m = blockIdx.y * m_count;
|
||||
auto offset_k = blockIdx.z * BLOCK_KN_SIZE;
|
||||
|
||||
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
|
||||
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
|
||||
@ -586,12 +586,12 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
|
||||
MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
int t = threadIdx.x;
|
||||
auto t = threadIdx.x;
|
||||
|
||||
// Block
|
||||
int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
|
||||
int offset_m = blockIdx.y * m_count;
|
||||
int offset_k = blockIdx.z * BLOCK_KN_SIZE;
|
||||
auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
|
||||
auto offset_m = blockIdx.y * m_count;
|
||||
auto offset_k = blockIdx.z * BLOCK_KN_SIZE;
|
||||
|
||||
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
|
||||
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
|
||||
@ -765,14 +765,14 @@ __global__ void reconstruct_exllama_8bit_kernel(
|
||||
MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
int offset_k = BLOCK_KN_SIZE * blockIdx.y;
|
||||
int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
|
||||
auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
|
||||
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
|
||||
|
||||
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
|
||||
|
||||
// Preload remapping table
|
||||
__shared__ int perm[BLOCK_KN_SIZE];
|
||||
int t = threadIdx.x;
|
||||
auto t = threadIdx.x;
|
||||
|
||||
if (b_q_perm) {
|
||||
if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
|
||||
@ -862,14 +862,14 @@ __global__ void reconstruct_exllama_4bit_kernel(
|
||||
MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
int offset_k = BLOCK_KN_SIZE * blockIdx.y;
|
||||
int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
|
||||
auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
|
||||
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
|
||||
|
||||
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
|
||||
|
||||
// Preload remapping table
|
||||
__shared__ int perm[BLOCK_KN_SIZE];
|
||||
int t = threadIdx.x;
|
||||
auto t = threadIdx.x;
|
||||
|
||||
if (b_q_perm) {
|
||||
if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
|
||||
@ -967,14 +967,14 @@ __global__ void reconstruct_exllama_3bit_kernel(
|
||||
MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
int offset_k = BLOCK_KN_SIZE * blockIdx.y;
|
||||
int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
|
||||
auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
|
||||
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
|
||||
|
||||
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
|
||||
|
||||
// Preload remapping table
|
||||
__shared__ int perm[BLOCK_KN_SIZE];
|
||||
int t = threadIdx.x;
|
||||
auto t = threadIdx.x;
|
||||
|
||||
if (b_q_perm) {
|
||||
if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
|
||||
@ -1065,14 +1065,14 @@ __global__ void reconstruct_exllama_2bit_kernel(
|
||||
MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
|
||||
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
|
||||
|
||||
int offset_k = BLOCK_KN_SIZE * blockIdx.y;
|
||||
int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
|
||||
auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
|
||||
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
|
||||
|
||||
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
|
||||
|
||||
// Preload remapping table
|
||||
__shared__ int perm[BLOCK_KN_SIZE];
|
||||
int t = threadIdx.x;
|
||||
auto t = threadIdx.x;
|
||||
|
||||
if (b_q_perm) {
|
||||
if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
|
||||
@ -1181,11 +1181,11 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
|
||||
int zero_width = width / 8;
|
||||
int vec_height = height * 4;
|
||||
const int blockwidth2 = BLOCK_KN_SIZE / 2;
|
||||
int b = blockIdx.y * BLOCK_M_SIZE_MAX;
|
||||
auto b = blockIdx.y * BLOCK_M_SIZE_MAX;
|
||||
int b_end = min(BLOCK_M_SIZE_MAX, batch - b);
|
||||
int h = BLOCK_KN_SIZE * blockIdx.z / 8;
|
||||
auto h = BLOCK_KN_SIZE * blockIdx.z / 8;
|
||||
int h_end = min(BLOCK_KN_SIZE / 8, height - h) * 4;
|
||||
int w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
|
||||
auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
|
||||
|
||||
__shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
|
||||
if (threadIdx.x < h_end) {
|
||||
@ -1197,8 +1197,8 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
|
||||
}
|
||||
|
||||
__shared__ half2 deq2[256][8];
|
||||
int val = threadIdx.x / 8;
|
||||
int off = threadIdx.x % 8;
|
||||
auto val = threadIdx.x / 8;
|
||||
auto off = threadIdx.x % 8;
|
||||
for (; val < 256; val += BLOCK_KN_SIZE / 8) {
|
||||
deq2[val][off] =
|
||||
__halves2half2(__int2half_rn(val & 0xF), __int2half_rn(val >> 4));
|
||||
@ -1280,11 +1280,11 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
|
||||
int zero_width = width / 4;
|
||||
int vec_height = height * 2;
|
||||
const int blockwidth2 = BLOCK_KN_SIZE / 2;
|
||||
int b = blockIdx.y * BLOCK_M_SIZE_MAX;
|
||||
auto b = blockIdx.y * BLOCK_M_SIZE_MAX;
|
||||
int b_end = min(BLOCK_M_SIZE_MAX, batch - b);
|
||||
int h = BLOCK_KN_SIZE * blockIdx.z / 4;
|
||||
auto h = BLOCK_KN_SIZE * blockIdx.z / 4;
|
||||
int h_end = min(BLOCK_KN_SIZE / 4, height - h) * 2;
|
||||
int w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
|
||||
auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
|
||||
|
||||
__shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
|
||||
if (threadIdx.x < h_end) {
|
||||
@ -1393,8 +1393,8 @@ __global__ void reconstruct_gptq_kernel(const uint32_t* __restrict__ w,
|
||||
half* __restrict__ out) {
|
||||
// Start of block
|
||||
|
||||
int column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
|
||||
int row = blockIdx.y * 32 / bit;
|
||||
auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
|
||||
auto row = blockIdx.y * 32 / bit;
|
||||
if (column >= width) return;
|
||||
|
||||
// Views
|
||||
@ -1425,8 +1425,8 @@ __global__ void reconstruct_gptq_3bit_kernel(
|
||||
const int height, const int width, const int group,
|
||||
half* __restrict__ out) {
|
||||
// Start of block
|
||||
int column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
|
||||
int row = blockIdx.y * 32;
|
||||
auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
|
||||
auto row = blockIdx.y * 32;
|
||||
if (column >= width) return;
|
||||
|
||||
// Views
|
||||
@ -1542,7 +1542,7 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
|
||||
|
||||
__global__ void shuffle_4bit_kernel(uint32_t* __restrict__ b_q_weight,
|
||||
const int size_k, const int size_n) {
|
||||
int n = blockIdx.x * THREADS_X + threadIdx.x;
|
||||
auto n = blockIdx.x * THREADS_X + threadIdx.x;
|
||||
if (n >= size_n) return;
|
||||
int k = 0;
|
||||
uint32_t* b_ptr = b_q_weight + n;
|
||||
@ -1555,7 +1555,7 @@ __global__ void shuffle_4bit_kernel(uint32_t* __restrict__ b_q_weight,
|
||||
|
||||
__global__ void shuffle_8bit_kernel(uint32_t* __restrict__ b_q_weight,
|
||||
const int size_k, const int size_n) {
|
||||
int n = blockIdx.x * THREADS_X + threadIdx.x;
|
||||
auto n = blockIdx.x * THREADS_X + threadIdx.x;
|
||||
if (n >= size_n) return;
|
||||
int k = 0;
|
||||
uint32_t* b_ptr = b_q_weight + n;
|
||||
@ -1568,7 +1568,7 @@ __global__ void shuffle_8bit_kernel(uint32_t* __restrict__ b_q_weight,
|
||||
|
||||
__global__ void shuffle_2bit_kernel(uint32_t* __restrict__ b_q_weight,
|
||||
const int size_k, const int size_n) {
|
||||
int n = blockIdx.x * THREADS_X + threadIdx.x;
|
||||
auto n = blockIdx.x * THREADS_X + threadIdx.x;
|
||||
if (n >= size_n) return;
|
||||
int k = 0;
|
||||
uint32_t* b_ptr = b_q_weight + n;
|
||||
@ -1581,7 +1581,7 @@ __global__ void shuffle_2bit_kernel(uint32_t* __restrict__ b_q_weight,
|
||||
|
||||
__global__ void shuffle_3bit_kernel(uint32_t* __restrict__ b_q_weight,
|
||||
const int size_k, const int size_n) {
|
||||
int n = blockIdx.x * THREADS_X + threadIdx.x;
|
||||
auto n = blockIdx.x * THREADS_X + threadIdx.x;
|
||||
if (n >= size_n) return;
|
||||
int k = 0;
|
||||
uint32_t* b_ptr = b_q_weight + n;
|
||||
@ -1599,9 +1599,9 @@ __global__ void make_sequential_4bit_kernel(const uint32_t* __restrict__ w,
|
||||
const uint64_t* w2 = (uint64_t*)w;
|
||||
uint64_t* w_new2 = (uint64_t*)w_new;
|
||||
int w2_stride = w_width >> 1;
|
||||
int w2_column = THREADS_X * blockIdx.x + threadIdx.x;
|
||||
auto w2_column = THREADS_X * blockIdx.x + threadIdx.x;
|
||||
if (w2_column >= w2_stride) return;
|
||||
int w_new2_row = blockIdx.y;
|
||||
auto w_new2_row = blockIdx.y;
|
||||
int q_perm_idx = w_new2_row << 3;
|
||||
uint64_t dst = 0;
|
||||
|
||||
@ -1630,9 +1630,9 @@ __global__ void make_sequential_2bit_kernel(const uint32_t* __restrict__ w,
|
||||
const uint64_t* w2 = (uint64_t*)w;
|
||||
uint64_t* w_new2 = (uint64_t*)w_new;
|
||||
int w2_stride = w_width >> 1;
|
||||
int w2_column = THREADS_X * blockIdx.x + threadIdx.x;
|
||||
auto w2_column = THREADS_X * blockIdx.x + threadIdx.x;
|
||||
if (w2_column >= w2_stride) return;
|
||||
int w_new2_row = blockIdx.y;
|
||||
auto w_new2_row = blockIdx.y;
|
||||
int q_perm_idx = w_new2_row << 4;
|
||||
uint64_t dst = 0;
|
||||
|
||||
@ -1658,10 +1658,10 @@ __global__ void make_sequential_3bit_kernel(const uint32_t* __restrict__ w,
|
||||
uint32_t* __restrict__ w_new,
|
||||
const int* __restrict__ q_perm,
|
||||
const int w_width) {
|
||||
int w_column = THREADS_X * blockIdx.x + threadIdx.x;
|
||||
auto w_column = THREADS_X * blockIdx.x + threadIdx.x;
|
||||
if (w_column >= w_width) return;
|
||||
int w_new_row = blockIdx.y * 3;
|
||||
int q_perm_idx = blockIdx.y << 5;
|
||||
auto w_new_row = blockIdx.y * 3;
|
||||
auto q_perm_idx = blockIdx.y << 5;
|
||||
uint32_t dst[3] = {0, 0, 0};
|
||||
|
||||
#pragma unroll
|
||||
@ -1744,9 +1744,9 @@ __global__ void make_sequential_8bit_kernel(const uint32_t* __restrict__ w,
|
||||
const uint64_t* w2 = (uint64_t*)w;
|
||||
uint64_t* w_new2 = (uint64_t*)w_new;
|
||||
int w2_stride = w_width >> 1;
|
||||
int w2_column = THREADS_X * blockIdx.x + threadIdx.x;
|
||||
auto w2_column = THREADS_X * blockIdx.x + threadIdx.x;
|
||||
if (w2_column >= w2_stride) return;
|
||||
int w_new2_row = blockIdx.y;
|
||||
auto w_new2_row = blockIdx.y;
|
||||
int q_perm_idx = w_new2_row << 2;
|
||||
uint64_t dst = 0;
|
||||
|
||||
|
||||
@ -55,11 +55,11 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
|
||||
this_block_B_base_ptr = params.B_ptr + blockIdx.y * Ntile * params.K +
|
||||
blockIdx.z * params.SplitK * 4;
|
||||
|
||||
const int lane_id = threadIdx.x % WARP_SIZE;
|
||||
const auto lane_id = threadIdx.x % WARP_SIZE;
|
||||
|
||||
// For matrix A, a block load/store Mtile(row) x 32(col) elements in
|
||||
// multiple iters, 8x4 warp load/store 8(row) x 32(col) elements per iter
|
||||
const int Aldg_row_base_idx = threadIdx.x / 4;
|
||||
const auto Aldg_row_base_idx = threadIdx.x / 4;
|
||||
Aldg_col_idx = (threadIdx.x % 4) * LDG_ELEMENT_CNT_A;
|
||||
const int Aldg_base_offset = Aldg_row_base_idx * params.K + Aldg_col_idx;
|
||||
|
||||
@ -67,7 +67,7 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
|
||||
// elements of N32K16 packing in multiple iters, 4x8 warp load/store 4(row)
|
||||
// * 128(col) per iter
|
||||
Bldg_col_idx = (threadIdx.x % 8) * LDG_ELEMENT_CNT_B;
|
||||
const int Bldg_row_base_idx = threadIdx.x / 8;
|
||||
const auto Bldg_row_base_idx = threadIdx.x / 8;
|
||||
const int Bldg_base_offset =
|
||||
Bldg_row_base_idx * params.K * 4 + Bldg_col_idx;
|
||||
|
||||
@ -89,7 +89,7 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
|
||||
B_ldg_guard = 0;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < (Mtile + M_SIZE_ONE_LOAD - 1) / M_SIZE_ONE_LOAD; ++i) {
|
||||
int m_idx = blockIdx.x * Mtile + Aldg_row_base_idx + i * M_SIZE_ONE_LOAD;
|
||||
auto m_idx = blockIdx.x * Mtile + Aldg_row_base_idx + i * M_SIZE_ONE_LOAD;
|
||||
if (m_idx < params.M) {
|
||||
A_ldg_guard |= (1u << i);
|
||||
}
|
||||
@ -98,8 +98,8 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
|
||||
const int N_padded = (params.N + 31) / 32 * 32;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < (Ntile + N_SIZE_ONE_LOAD - 1) / N_SIZE_ONE_LOAD; ++i) {
|
||||
int n_idx = blockIdx.y * Ntile + (Bldg_row_base_idx / 8) * 32 +
|
||||
i * N_SIZE_ONE_LOAD;
|
||||
auto n_idx = blockIdx.y * Ntile + (Bldg_row_base_idx / 8) * 32 +
|
||||
i * N_SIZE_ONE_LOAD;
|
||||
if (n_idx < N_padded) {
|
||||
B_ldg_guard |= (1u << i);
|
||||
}
|
||||
@ -355,7 +355,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
|
||||
__device__ void fused_splitk_reduce() {
|
||||
// need splitk-reduce if enable splitk
|
||||
if (gridDim.z > 1) {
|
||||
int blk_red_idx = blockIdx.x * gridDim.y + blockIdx.y;
|
||||
auto blk_red_idx = blockIdx.x * gridDim.y + blockIdx.y;
|
||||
// Wait for all previous blocks in the splitk direction to accumulate the
|
||||
// results into C_tmp
|
||||
if (threadIdx.x == 0) {
|
||||
@ -371,7 +371,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
int C_tmp_base_offset = blk_red_idx * Mtile * Ntile + threadIdx.x * 4;
|
||||
auto C_tmp_base_offset = blk_red_idx * Mtile * Ntile + threadIdx.x * 4;
|
||||
if (blockIdx.z != 0) {
|
||||
// expecting that temporary register here reuses the previous A&B frag
|
||||
// register
|
||||
@ -456,7 +456,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
|
||||
|
||||
FType* C_base_ptr = this_block_C_base_ptr + store_c_base_offset;
|
||||
// C_tile lds and stg
|
||||
int m_base_idx = store_c_row_base_idx + blockIdx.x * Mtile;
|
||||
auto m_base_idx = store_c_row_base_idx + blockIdx.x * Mtile;
|
||||
bool n_guard = (store_c_col_idx + blockIdx.y * Ntile) < params.N;
|
||||
if (WARP_NTILE == 32) {
|
||||
int lds_c_base_offset = warp_id * Mtile * WARP_NTILE +
|
||||
@ -580,9 +580,9 @@ __global__ void __launch_bounds__(BLOCK)
|
||||
int sts_stage_idx = 0;
|
||||
int lds_stage_idx = 0;
|
||||
|
||||
int tb_k_slice = blockIdx.z * params.SplitK + params.SplitK <= params.K
|
||||
? params.SplitK
|
||||
: params.K - blockIdx.z * params.SplitK;
|
||||
auto tb_k_slice = blockIdx.z * params.SplitK + params.SplitK <= params.K
|
||||
? params.SplitK
|
||||
: params.K - blockIdx.z * params.SplitK;
|
||||
int k_tiles = (tb_k_slice + 31) / 32;
|
||||
int first_k_tile = tb_k_slice - (k_tiles - 1) * 32;
|
||||
|
||||
@ -777,13 +777,13 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel(
|
||||
const QT* qdata, const FT* scales, const FT* zeros, FT* fdata,
|
||||
const int N_32align, const int N, const int K) {
|
||||
__shared__ FT smem[64 * 32];
|
||||
int warp_id = threadIdx.x / 32;
|
||||
int lane_id = threadIdx.x % 32;
|
||||
const int src_row_idx = blockIdx.x * 8 + lane_id / 4;
|
||||
auto warp_id = threadIdx.x / 32;
|
||||
auto lane_id = threadIdx.x % 32;
|
||||
const auto src_row_idx = blockIdx.x * 8 + lane_id / 4;
|
||||
const int src_col_idx =
|
||||
blockIdx.y * 64 * 4 + warp_id * 16 * 4 + (lane_id % 4) * 16;
|
||||
const int src_offset = src_row_idx * K * 4 + src_col_idx;
|
||||
int params_nidx = blockIdx.x * 32 + (lane_id / 4) * 4;
|
||||
auto params_nidx = blockIdx.x * 32 + (lane_id / 4) * 4;
|
||||
|
||||
QT qval_reg[16];
|
||||
const QT* pdata = qdata + src_offset;
|
||||
@ -829,8 +829,8 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel(
|
||||
*reinterpret_cast<uint4*>(smem + lds_base_offset + i * 32 * 32);
|
||||
}
|
||||
|
||||
const int dst_row_base_kidx = blockIdx.y * 64 + threadIdx.x / 4;
|
||||
const int dst_col_nidx = blockIdx.x * 32 + (threadIdx.x % 4) * 8;
|
||||
const auto dst_row_base_kidx = blockIdx.y * 64 + threadIdx.x / 4;
|
||||
const auto dst_col_nidx = blockIdx.x * 32 + (threadIdx.x % 4) * 8;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
int dst_row_kidx = dst_row_base_kidx + i * 32;
|
||||
@ -1008,4 +1008,4 @@ torch::Tensor allspark_w8a16_gemm(
|
||||
|
||||
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
|
||||
m.impl("allspark_w8a16_gemm", &allspark_w8a16_gemm);
|
||||
}
|
||||
}
|
||||
|
||||
@ -13,8 +13,8 @@ __global__ void __launch_bounds__(128)
|
||||
const uint8_t* B, const FType* B_scale, const FType* B_zero,
|
||||
uint8_t* B_result, FType* B_scale_result, FType* B_zero_result,
|
||||
const int K, const int N, const int N_32align) {
|
||||
const int lane_id = threadIdx.x % 32;
|
||||
const int warp_id = threadIdx.x / 32;
|
||||
const auto lane_id = threadIdx.x % 32;
|
||||
const auto warp_id = threadIdx.x / 32;
|
||||
|
||||
if (blockIdx.x != gridDim.x - 1) {
|
||||
// Load B
|
||||
@ -50,7 +50,7 @@ __global__ void __launch_bounds__(128)
|
||||
}
|
||||
|
||||
// Store B
|
||||
const int dst_row_base_idx = blockIdx.y * (128 / 4) + (lane_id / 8) * 8;
|
||||
const auto dst_row_base_idx = blockIdx.y * (128 / 4) + (lane_id / 8) * 8;
|
||||
const int dst_col_idx =
|
||||
blockIdx.x * (64 * 4) + warp_id * 64 + (lane_id % 8) * 8;
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
@ -65,7 +65,7 @@ __global__ void __launch_bounds__(128)
|
||||
} else {
|
||||
// Load B_scale and B_zero
|
||||
FType b_scale_reg, b_zero_reg;
|
||||
int src_offset = blockIdx.y * 128 + threadIdx.x;
|
||||
auto src_offset = blockIdx.y * 128 + threadIdx.x;
|
||||
ldg16_cg_0(b_scale_reg, B_scale + src_offset, src_offset < N);
|
||||
if (B_zero != nullptr)
|
||||
ldg16_cg_0(b_zero_reg, B_zero + src_offset, src_offset < N);
|
||||
|
||||
@ -62,7 +62,7 @@ template <typename FType, int BLOCK, int N_MATRIX>
|
||||
__global__ void f16_gemm_splitk_reduce_kernel(const FType* C_split, FType* C,
|
||||
uint32_t n, uint32_t n_matrix,
|
||||
uint32_t matrix_size) {
|
||||
int idx = blockIdx.x * BLOCK + threadIdx.x;
|
||||
auto idx = blockIdx.x * BLOCK + threadIdx.x;
|
||||
|
||||
if (idx >= matrix_size) {
|
||||
return;
|
||||
@ -407,4 +407,4 @@ static __device__ half2 inline num2num2(const half x) {
|
||||
return __half2half2(x);
|
||||
}
|
||||
|
||||
} // namespace allspark
|
||||
} // namespace allspark
|
||||
|
||||
@ -124,3 +124,52 @@ nsys stats report1.nsys-rep
|
||||
GUI example:
|
||||
|
||||
<img width="1799" alt="Screenshot 2025-03-05 at 11 48 42 AM" src="https://github.com/user-attachments/assets/c7cff1ae-6d6f-477d-a342-bd13c4fc424c" />
|
||||
|
||||
## Profiling vLLM Python Code
|
||||
|
||||
The Python standard library includes
|
||||
[cProfile](https://docs.python.org/3/library/profile.html) for profiling Python
|
||||
code. vLLM includes a couple of helpers that make it easy to apply it to a section of vLLM.
|
||||
Both the `vllm.utils.cprofile` and `vllm.utils.cprofile_context` functions can be
|
||||
used to profile a section of code.
|
||||
|
||||
### Example usage - decorator
|
||||
|
||||
The first helper is a Python decorator that can be used to profile a function.
|
||||
If a filename is specified, the profile will be saved to that file. If no filename is
|
||||
specified, profile data will be printed to stdout.
|
||||
|
||||
```python
|
||||
import vllm.utils
|
||||
|
||||
@vllm.utils.cprofile("expensive_function.prof")
|
||||
def expensive_function():
|
||||
# some expensive code
|
||||
pass
|
||||
```
|
||||
|
||||
### Example Usage - context manager
|
||||
|
||||
The second helper is a context manager that can be used to profile a block of
|
||||
code. Similar to the decorator, the filename is optional.
|
||||
|
||||
```python
|
||||
import vllm.utils
|
||||
|
||||
def another_function():
|
||||
# more expensive code
|
||||
pass
|
||||
|
||||
with vllm.utils.cprofile_context("another_function.prof"):
|
||||
another_function()
|
||||
```
|
||||
|
||||
### Analyzing Profile Results
|
||||
|
||||
There are multiple tools available that can help analyze the profile results.
|
||||
One example is [snakeviz](https://jiffyclub.github.io/snakeviz/).
|
||||
|
||||
```bash
|
||||
pip install snakeviz
|
||||
snakeviz expensive_function.prof
|
||||
```
|
||||
|
||||
@ -7,5 +7,192 @@ A major use case is for multi-host/multi-node distributed inference.
|
||||
|
||||
vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kubernetes for distributed model serving.
|
||||
|
||||
Please see [this guide](https://github.com/kubernetes-sigs/lws/tree/main/docs/examples/vllm) for more details on
|
||||
deploying vLLM on Kubernetes using LWS.
|
||||
## Prerequisites
|
||||
|
||||
* At least two Kubernetes nodes, each with 8 GPUs, are required.
|
||||
* Install LWS by following the instructions found [here](https://lws.sigs.k8s.io/docs/installation/).
|
||||
|
||||
## Deploy and Serve
|
||||
|
||||
Deploy the following yaml file `lws.yaml`
|
||||
|
||||
```yaml
|
||||
apiVersion: leaderworkerset.x-k8s.io/v1
|
||||
kind: LeaderWorkerSet
|
||||
metadata:
|
||||
name: vllm
|
||||
spec:
|
||||
replicas: 2
|
||||
leaderWorkerTemplate:
|
||||
size: 2
|
||||
restartPolicy: RecreateGroupOnPodRestart
|
||||
leaderTemplate:
|
||||
metadata:
|
||||
labels:
|
||||
role: leader
|
||||
spec:
|
||||
containers:
|
||||
- name: vllm-leader
|
||||
image: docker.io/vllm/vllm-openai:latest
|
||||
env:
|
||||
- name: HUGGING_FACE_HUB_TOKEN
|
||||
value: <your-hf-token>
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
|
||||
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
memory: 1124Gi
|
||||
ephemeral-storage: 800Gi
|
||||
requests:
|
||||
ephemeral-storage: 800Gi
|
||||
cpu: 125
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
readinessProbe:
|
||||
tcpSocket:
|
||||
port: 8080
|
||||
initialDelaySeconds: 15
|
||||
periodSeconds: 10
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
sizeLimit: 15Gi
|
||||
workerTemplate:
|
||||
spec:
|
||||
containers:
|
||||
- name: vllm-worker
|
||||
image: docker.io/vllm/vllm-openai:latest
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "8"
|
||||
memory: 1124Gi
|
||||
ephemeral-storage: 800Gi
|
||||
requests:
|
||||
ephemeral-storage: 800Gi
|
||||
cpu: 125
|
||||
env:
|
||||
- name: HUGGING_FACE_HUB_TOKEN
|
||||
value: <your-hf-token>
|
||||
volumeMounts:
|
||||
- mountPath: /dev/shm
|
||||
name: dshm
|
||||
volumes:
|
||||
- name: dshm
|
||||
emptyDir:
|
||||
medium: Memory
|
||||
sizeLimit: 15Gi
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: vllm-leader
|
||||
spec:
|
||||
ports:
|
||||
- name: http
|
||||
port: 8080
|
||||
protocol: TCP
|
||||
targetPort: 8080
|
||||
selector:
|
||||
leaderworkerset.sigs.k8s.io/name: vllm
|
||||
role: leader
|
||||
type: ClusterIP
|
||||
```
|
||||
|
||||
```bash
|
||||
kubectl apply -f lws.yaml
|
||||
```
|
||||
|
||||
Verify the status of the pods:
|
||||
|
||||
```bash
|
||||
kubectl get pods
|
||||
```
|
||||
|
||||
Should get an output similar to this:
|
||||
|
||||
```bash
|
||||
NAME READY STATUS RESTARTS AGE
|
||||
vllm-0 1/1 Running 0 2s
|
||||
vllm-0-1 1/1 Running 0 2s
|
||||
vllm-1 1/1 Running 0 2s
|
||||
vllm-1-1 1/1 Running 0 2s
|
||||
```
|
||||
|
||||
Verify that the distributed tensor-parallel inference works:
|
||||
|
||||
```bash
|
||||
kubectl logs vllm-0 |grep -i "Loading model weights took"
|
||||
```
|
||||
|
||||
Should get something similar to this:
|
||||
|
||||
```text
|
||||
INFO 05-08 03:20:24 model_runner.py:173] Loading model weights took 0.1189 GB
|
||||
(RayWorkerWrapper pid=169, ip=10.20.0.197) INFO 05-08 03:20:28 model_runner.py:173] Loading model weights took 0.1189 GB
|
||||
```
|
||||
|
||||
## Access ClusterIP service
|
||||
|
||||
```bash
|
||||
# Listen on port 8080 locally, forwarding to the targetPort of the service's port 8080 in a pod selected by the service
|
||||
kubectl port-forward svc/vllm-leader 8080:8080
|
||||
```
|
||||
|
||||
The output should be similar to the following:
|
||||
|
||||
```text
|
||||
Forwarding from 127.0.0.1:8080 -> 8080
|
||||
Forwarding from [::1]:8080 -> 8080
|
||||
```
|
||||
|
||||
## Serve the model
|
||||
|
||||
Open another terminal and send a request
|
||||
|
||||
```text
|
||||
curl http://localhost:8080/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
|
||||
"prompt": "San Francisco is a",
|
||||
"max_tokens": 7,
|
||||
"temperature": 0
|
||||
}'
|
||||
```
|
||||
|
||||
The output should be similar to the following
|
||||
|
||||
```text
|
||||
{
|
||||
"id": "cmpl-1bb34faba88b43f9862cfbfb2200949d",
|
||||
"object": "text_completion",
|
||||
"created": 1715138766,
|
||||
"model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
|
||||
"choices": [
|
||||
{
|
||||
"index": 0,
|
||||
"text": " top destination for foodies, with",
|
||||
"logprobs": null,
|
||||
"finish_reason": "length",
|
||||
"stop_reason": null
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 5,
|
||||
"total_tokens": 12,
|
||||
"completion_tokens": 7
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
@ -25,7 +25,7 @@ import torch
|
||||
# unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
|
||||
model_id = "unsloth/tinyllama-bnb-4bit"
|
||||
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
|
||||
quantization="bitsandbytes", load_format="bitsandbytes")
|
||||
quantization="bitsandbytes")
|
||||
```
|
||||
|
||||
## Inflight quantization: load as 4bit quantization
|
||||
@ -35,7 +35,7 @@ from vllm import LLM
|
||||
import torch
|
||||
model_id = "huggyllama/llama-7b"
|
||||
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
|
||||
quantization="bitsandbytes", load_format="bitsandbytes")
|
||||
quantization="bitsandbytes")
|
||||
```
|
||||
|
||||
## OpenAI Compatible Server
|
||||
@ -43,5 +43,5 @@ quantization="bitsandbytes", load_format="bitsandbytes")
|
||||
Append the following to your 4bit model arguments:
|
||||
|
||||
```console
|
||||
--quantization bitsandbytes --load-format bitsandbytes
|
||||
--quantization bitsandbytes
|
||||
```
|
||||
|
||||
@ -26,4 +26,3 @@ installation/ai_accelerator
|
||||
- Google TPU
|
||||
- Intel Gaudi
|
||||
- AWS Neuron
|
||||
- OpenVINO
|
||||
|
||||
@ -36,16 +36,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} ai_accelerator/openvino.inc.md
|
||||
:start-after: "# Installation"
|
||||
:end-before: "## Requirements"
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
## Requirements
|
||||
@ -83,16 +73,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} ai_accelerator/openvino.inc.md
|
||||
:start-after: "## Requirements"
|
||||
:end-before: "## Set up using Python"
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
## Configure a new environment
|
||||
@ -130,14 +110,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} python_env_setup.inc.md
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
## Set up using Python
|
||||
@ -177,16 +149,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} ai_accelerator/openvino.inc.md
|
||||
:start-after: "### Pre-built wheels"
|
||||
:end-before: "### Build wheel from source"
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
### Build wheel from source
|
||||
@ -224,16 +186,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} ai_accelerator/openvino.inc.md
|
||||
:start-after: "### Build wheel from source"
|
||||
:end-before: "## Set up using Docker"
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
## Set up using Docker
|
||||
@ -273,16 +225,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} ai_accelerator/openvino.inc.md
|
||||
:start-after: "### Pre-built images"
|
||||
:end-before: "### Build image from source"
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
### Build image from source
|
||||
@ -320,16 +262,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} ai_accelerator/openvino.inc.md
|
||||
:start-after: "### Build image from source"
|
||||
:end-before: "## Extra information"
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
## Extra information
|
||||
@ -364,13 +296,4 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} ai_accelerator/openvino.inc.md
|
||||
:start-after: "## Extra information"
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
@ -1,110 +0,0 @@
|
||||
# Installation
|
||||
|
||||
vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)).
|
||||
|
||||
:::{attention}
|
||||
There are no pre-built wheels or images for this device, so you must build vLLM from source.
|
||||
:::
|
||||
|
||||
## Requirements
|
||||
|
||||
- OS: Linux
|
||||
- Instruction set architecture (ISA) requirement: at least AVX2.
|
||||
|
||||
## Set up using Python
|
||||
|
||||
### Pre-built wheels
|
||||
|
||||
Currently, there are no pre-built OpenVINO wheels.
|
||||
|
||||
### Build wheel from source
|
||||
|
||||
First, install Python and ensure you have the latest pip. For example, on Ubuntu 22.04, you can run:
|
||||
|
||||
```console
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install python3
|
||||
pip install --upgrade pip
|
||||
```
|
||||
|
||||
Second, clone vLLM and install prerequisites for the vLLM OpenVINO backend installation:
|
||||
|
||||
```console
|
||||
git clone https://github.com/vllm-project/vllm.git
|
||||
cd vllm
|
||||
pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
```
|
||||
|
||||
Finally, install vLLM with OpenVINO backend:
|
||||
|
||||
```console
|
||||
PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
|
||||
```
|
||||
|
||||
:::{tip}
|
||||
To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html).
|
||||
:::
|
||||
|
||||
## Set up using Docker
|
||||
|
||||
### Pre-built images
|
||||
|
||||
Currently, there are no pre-built OpenVINO images.
|
||||
|
||||
### Build image from source
|
||||
|
||||
```console
|
||||
docker build -f Dockerfile.openvino -t vllm-openvino-env .
|
||||
docker run -it --rm vllm-openvino-env
|
||||
```
|
||||
|
||||
## Extra information
|
||||
|
||||
## Supported features
|
||||
|
||||
OpenVINO vLLM backend supports the following advanced vLLM features:
|
||||
|
||||
- Prefix caching (`--enable-prefix-caching`)
|
||||
- Chunked prefill (`--enable-chunked-prefill`)
|
||||
|
||||
## Performance tips
|
||||
|
||||
### vLLM OpenVINO backend environment variables
|
||||
|
||||
- `VLLM_OPENVINO_DEVICE` to specify which device utilize for the inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g, `VLLM_OPENVINO_DEVICE=GPU.1`). If the value is not specified, CPU device is used by default.
|
||||
- `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON` to enable U8 weights compression during model loading stage. By default, compression is turned off. You can also export model with different compression techniques using `optimum-cli` and pass exported folder as `<model_id>`
|
||||
|
||||
### CPU performance tips
|
||||
|
||||
CPU uses the following environment variables to control behavior:
|
||||
|
||||
- `VLLM_OPENVINO_KVCACHE_SPACE` to specify the KV Cache size (e.g, `VLLM_OPENVINO_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
|
||||
- `VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8` to control KV cache precision. By default, FP16 / BF16 is used depending on platform.
|
||||
|
||||
To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (`--enable-chunked-prefill`). Based on the experiments, the recommended batch size is `256` (`--max-num-batched-tokens`)
|
||||
|
||||
OpenVINO best known configuration for CPU is:
|
||||
|
||||
```console
|
||||
$ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
|
||||
python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256
|
||||
```
|
||||
|
||||
### GPU performance tips
|
||||
|
||||
GPU device implements the logic for automatic detection of available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking into account `gpu_memory_utilization` option). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using `VLLM_OPENVINO_KVCACHE_SPACE` environment variable (e.g, `VLLM_OPENVINO_KVCACHE_SPACE=8` means 8 GB space for KV cache).
|
||||
|
||||
Currently, the best performance using GPU can be achieved with the default vLLM execution parameters for models with quantized weights (8 and 4-bit integer data types are supported) and `preemption-mode=swap`.
|
||||
|
||||
OpenVINO best known configuration for GPU is:
|
||||
|
||||
```console
|
||||
$ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
|
||||
python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
```
|
||||
|
||||
## Limitations
|
||||
|
||||
- LoRA serving is not supported.
|
||||
- Only LLM models are currently supported. LLaVa and encoder-decoder models are not currently enabled in vLLM OpenVINO integration.
|
||||
- Tensor and pipeline parallelism are not currently enabled in vLLM integration.
|
||||
@ -472,6 +472,11 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
* `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc.
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `TeleFLMForCausalLM`
|
||||
* TeleFLM
|
||||
* `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc.
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `XverseForCausalLM`
|
||||
* XVERSE
|
||||
* `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.
|
||||
|
||||
@ -29,6 +29,11 @@ completion = client.chat.completions.create(
|
||||
print(completion.choices[0].message)
|
||||
```
|
||||
|
||||
:::{tip}
|
||||
vLLM supports some parameters that are not supported by OpenAI, `top_k` for example.
|
||||
You can pass these parameters to vLLM using the OpenAI client in the `extra_body` parameter of your requests, i.e. `extra_body={"top_k": 50}` for `top_k`.
|
||||
:::
|
||||
|
||||
## Supported APIs
|
||||
|
||||
We currently support the following OpenAI APIs:
|
||||
|
||||
@ -83,7 +83,6 @@ def initialize_engine(model: str, quantization: str,
|
||||
engine_args = EngineArgs(model=model,
|
||||
quantization=quantization,
|
||||
qlora_adapter_name_or_path=lora_repo,
|
||||
load_format="bitsandbytes",
|
||||
enable_lora=True,
|
||||
max_lora_rank=64)
|
||||
else:
|
||||
|
||||
36
examples/offline_inference/reproduciblity.py
Normal file
36
examples/offline_inference/reproduciblity.py
Normal file
@ -0,0 +1,36 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import os
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
# vLLM does not guarantee the reproducibility of the results by default,
|
||||
# for the sake of performance. You need to do the following to achieve
|
||||
# reproducible results:
|
||||
# 1. Turn off multiprocessing to make the scheduling deterministic.
|
||||
# NOTE(woosuk): This is not needed and will be ignored for V0.
|
||||
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
|
||||
# 2. Fix the global seed for reproducibility. The default seed is None, which is
|
||||
# not reproducible.
|
||||
SEED = 42
|
||||
|
||||
# NOTE(woosuk): Even with the above two settings, vLLM only provides
|
||||
# reproducibility when it runs on the same hardware and the same vLLM version.
|
||||
# Also, the online serving API (`vllm serve`) does not support reproducibility
|
||||
# because it is almost impossible to make the scheduling deterministic in the
|
||||
# online serving setting.
|
||||
|
||||
llm = LLM(model="facebook/opt-125m", seed=SEED)
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
12
examples/template_teleflm.jinja
Normal file
12
examples/template_teleflm.jinja
Normal file
@ -0,0 +1,12 @@
|
||||
{%- for message in messages %}
|
||||
{%- if message['role'] == 'user' %}
|
||||
{{- '<_user>' + message['content']|trim }}
|
||||
{%- elif message['role'] == 'system' %}
|
||||
{{- '<_system>' + message['content']|trim }}
|
||||
{%- elif message['role'] == 'assistant' %}
|
||||
{{- '<_bot>' + message['content'] }}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- if add_generation_prompt %}
|
||||
{{- '<_bot>' }}
|
||||
{%- endif %}
|
||||
@ -1,8 +0,0 @@
|
||||
# Common dependencies
|
||||
-r common.txt
|
||||
|
||||
torch == 2.5.1 # should be aligned with "common" vLLM torch version
|
||||
openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention
|
||||
|
||||
optimum @ git+https://github.com/huggingface/optimum.git # latest optimum is used to support latest transformers version
|
||||
optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git # latest optimum-intel is used to support latest transformers version
|
||||
@ -1,10 +1,10 @@
|
||||
# Common dependencies
|
||||
-r common.txt
|
||||
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm6.2
|
||||
torch==2.5.1
|
||||
torchvision==0.20.1
|
||||
torchaudio==2.5.1
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm6.2.4
|
||||
torch==2.6.0
|
||||
torchvision==0.21.0
|
||||
torchaudio==2.6.0
|
||||
|
||||
cmake>=3.26
|
||||
packaging
|
||||
|
||||
10
setup.py
10
setup.py
@ -449,10 +449,6 @@ def _is_cpu() -> bool:
|
||||
return VLLM_TARGET_DEVICE == "cpu"
|
||||
|
||||
|
||||
def _is_openvino() -> bool:
|
||||
return VLLM_TARGET_DEVICE == "openvino"
|
||||
|
||||
|
||||
def _is_xpu() -> bool:
|
||||
return VLLM_TARGET_DEVICE == "xpu"
|
||||
|
||||
@ -572,8 +568,6 @@ def get_vllm_version() -> str:
|
||||
if gaudi_sw_version != MAIN_CUDA_VERSION:
|
||||
gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3]
|
||||
version += f"{sep}gaudi{gaudi_sw_version}"
|
||||
elif _is_openvino():
|
||||
version += f"{sep}openvino"
|
||||
elif _is_tpu():
|
||||
version += f"{sep}tpu"
|
||||
elif _is_cpu():
|
||||
@ -623,8 +617,6 @@ def get_requirements() -> list[str]:
|
||||
requirements = _read_requirements("neuron.txt")
|
||||
elif _is_hpu():
|
||||
requirements = _read_requirements("hpu.txt")
|
||||
elif _is_openvino():
|
||||
requirements = _read_requirements("openvino.txt")
|
||||
elif _is_tpu():
|
||||
requirements = _read_requirements("tpu.txt")
|
||||
elif _is_cpu():
|
||||
@ -634,7 +626,7 @@ def get_requirements() -> list[str]:
|
||||
else:
|
||||
raise ValueError(
|
||||
"Unsupported platform, please use CUDA, ROCm, Neuron, HPU, "
|
||||
"OpenVINO, or CPU.")
|
||||
"or CPU.")
|
||||
return requirements
|
||||
|
||||
|
||||
|
||||
@ -273,8 +273,7 @@ class HfRunner:
|
||||
def get_default_device(self):
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
return ("cpu" if current_platform.is_cpu()
|
||||
or current_platform.is_openvino() else "cuda")
|
||||
return ("cpu" if current_platform.is_cpu() else "cuda")
|
||||
|
||||
def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
|
||||
if x is None or isinstance(x, (bool, )):
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from unittest.mock import Mock, patch
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -8,7 +8,6 @@ import torch
|
||||
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
|
||||
from vllm.platforms.cpu import CpuPlatform
|
||||
from vllm.platforms.cuda import CudaPlatform
|
||||
from vllm.platforms.openvino import OpenVinoPlatform
|
||||
from vllm.platforms.rocm import RocmPlatform
|
||||
from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL
|
||||
|
||||
@ -21,9 +20,9 @@ def clear_cache():
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
|
||||
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"])
|
||||
@pytest.mark.parametrize("use_v1", [True, False])
|
||||
@pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
|
||||
@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
|
||||
def test_env(
|
||||
name: str,
|
||||
use_v1: bool,
|
||||
@ -49,15 +48,8 @@ def test_env(
|
||||
RocmPlatform()):
|
||||
backend = get_attn_backend(16, torch.float16, torch.float16,
|
||||
16, False)
|
||||
EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
|
||||
EXPECTED = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
|
||||
assert backend.get_name() == EXPECTED
|
||||
elif device == "openvino":
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
OpenVinoPlatform()), patch.dict('sys.modules',
|
||||
{'openvino': Mock()}):
|
||||
backend = get_attn_backend(16, torch.float16, torch.float16,
|
||||
16, False)
|
||||
assert backend.get_name() == "OPENVINO"
|
||||
else:
|
||||
if name in ["XFORMERS", "FLASHINFER"]:
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
|
||||
@ -26,7 +26,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
|
||||
# Test standard ROCm attention
|
||||
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
|
||||
assert (backend.get_name() == "ROCM_FLASH"
|
||||
or backend.get_name() == "ROCM_ATTN_VLLM_V1")
|
||||
or backend.get_name() == "TRITON_ATTN_VLLM_V1")
|
||||
|
||||
# mla test for deepseek related
|
||||
backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False,
|
||||
|
||||
@ -1,10 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import asyncio
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
import vllm.envs as env
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
@ -13,35 +11,9 @@ from vllm.lora.request import LoRARequest
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils import merge_async_iterators
|
||||
|
||||
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
|
||||
LORA_MODULE_DOWNLOAD_PATH = None # Populated by download_and_prepare_lora_module() #noqa
|
||||
LORA_RANK = 8
|
||||
DEFAULT_MAX_LORAS = 16 * 3
|
||||
|
||||
|
||||
def download_and_prepare_lora_module():
|
||||
"""
|
||||
Request submission is expensive when the LoRA adapters have their own
|
||||
tokenizers. This is because, for each request with a new LoRA adapter ID,
|
||||
the front-end loads the tokenizer from disk.
|
||||
|
||||
In this test, as we are comparing request processing times, we want to
|
||||
minimize any extra activity. To this effect, we download the LoRA
|
||||
adapter and remove all the tokenizer files, so the engine will default
|
||||
to the base model tokenizer.
|
||||
"""
|
||||
global LORA_MODULE_DOWNLOAD_PATH
|
||||
|
||||
LORA_MODULE_HF_PATH = "yard1/llama-2-7b-sql-lora-test"
|
||||
LORA_MODULE_DOWNLOAD_PATH = snapshot_download(repo_id=LORA_MODULE_HF_PATH)
|
||||
|
||||
tokenizer_files = [
|
||||
'added_tokens.json', 'tokenizer_config.json', 'tokenizer.json',
|
||||
'tokenizer.model'
|
||||
]
|
||||
for tokenizer_file in tokenizer_files:
|
||||
del_path = Path(LORA_MODULE_DOWNLOAD_PATH) / tokenizer_file
|
||||
del_path.unlink(missing_ok=True)
|
||||
MODEL_PATH = "THUDM/chatglm3-6b"
|
||||
LORA_RANK = 64
|
||||
DEFAULT_MAX_LORAS = 4 * 3
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
@ -52,11 +24,9 @@ def v1(run_with_both_engines_lora):
|
||||
pass
|
||||
|
||||
|
||||
def get_lora_requests() -> list[LoRARequest]:
|
||||
def get_lora_requests(lora_path) -> list[LoRARequest]:
|
||||
lora_requests: list[LoRARequest] = [
|
||||
LoRARequest(lora_name=f"{i}",
|
||||
lora_int_id=i,
|
||||
lora_path=LORA_MODULE_DOWNLOAD_PATH)
|
||||
LoRARequest(lora_name=f"{i}", lora_int_id=i, lora_path=lora_path)
|
||||
for i in range(1, DEFAULT_MAX_LORAS + 1)
|
||||
]
|
||||
return lora_requests
|
||||
@ -93,7 +63,7 @@ async def requests_processing_time(llm,
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_add_lora():
|
||||
async def test_add_lora(chatglm3_lora_files):
|
||||
"""
|
||||
The add_lora function is used to pre-load some LoRA adapters into the
|
||||
engine in anticipation of future requests using these adapters. To test
|
||||
@ -103,10 +73,7 @@ async def test_add_lora():
|
||||
We measure the request processing time in both cases and expect the time
|
||||
to be lesser in the case with add_lora() calls.
|
||||
"""
|
||||
|
||||
download_and_prepare_lora_module()
|
||||
|
||||
lora_requests: list[LoRARequest] = get_lora_requests()
|
||||
lora_requests: list[LoRARequest] = get_lora_requests(chatglm3_lora_files)
|
||||
|
||||
max_loras = len(set([lr.lora_int_id for lr in lora_requests]))
|
||||
# Create engine in eager-mode. Due to high max_loras, the CI can
|
||||
@ -118,6 +85,7 @@ async def test_add_lora():
|
||||
max_lora_rank=LORA_RANK,
|
||||
max_model_len=128,
|
||||
gpu_memory_utilization=0.8, #avoid OOM
|
||||
trust_remote_code=True,
|
||||
enforce_eager=True)
|
||||
|
||||
# The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1`
|
||||
|
||||
@ -84,12 +84,14 @@ def v1(run_with_both_engines_lora):
|
||||
@create_new_process_for_each_test()
|
||||
def test_llama_lora(sql_lora_files):
|
||||
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=1,
|
||||
enable_chunked_prefill=True)
|
||||
llm = vllm.LLM(
|
||||
MODEL_PATH,
|
||||
enable_lora=True,
|
||||
# also test odd max_num_seqs
|
||||
max_num_seqs=13,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=1,
|
||||
enable_chunked_prefill=True)
|
||||
generate_and_test(llm, sql_lora_files)
|
||||
|
||||
|
||||
|
||||
@ -24,12 +24,10 @@ async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type):
|
||||
)
|
||||
lora_request = LoRARequest("1", 1, sql_lora_files)
|
||||
assert reference_tokenizer.encode("prompt") == tokenizer_group.encode(
|
||||
request_id="request_id", prompt="prompt", lora_request=lora_request)
|
||||
prompt="prompt", lora_request=lora_request)
|
||||
assert reference_tokenizer.encode(
|
||||
"prompt") == await tokenizer_group.encode_async(
|
||||
request_id="request_id",
|
||||
prompt="prompt",
|
||||
lora_request=lora_request)
|
||||
prompt="prompt", lora_request=lora_request)
|
||||
assert isinstance(tokenizer_group.get_lora_tokenizer(None),
|
||||
PreTrainedTokenizerBase)
|
||||
assert tokenizer_group.get_lora_tokenizer(
|
||||
|
||||
@ -7,7 +7,10 @@ from vllm.model_executor.custom_op import CustomOp
|
||||
from vllm.model_executor.layers.activation import (GeluAndMul,
|
||||
ReLUSquaredActivation,
|
||||
SiluAndMul)
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.layernorm import (
|
||||
RMSNorm, dispatch_cuda_rmsnorm_func, fused_add_rms_norm, rms_norm,
|
||||
rocm_aiter_fused_add_rms_norm, rocm_aiter_rms_norm)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
# Registered subclass for test
|
||||
@ -87,3 +90,27 @@ def test_enabled_ops_invalid(env: str):
|
||||
custom_ops=env.split(",")))
|
||||
with set_current_vllm_config(vllm_config):
|
||||
RMSNorm(1024).enabled()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("add_residual", [True, False])
|
||||
@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
|
||||
@pytest.mark.parametrize("use_rocm_aiter_norm", ["0", "1"])
|
||||
@pytest.mark.skipif(not current_platform.is_rocm(),
|
||||
reason="AITER is a feature exclusive for ROCm")
|
||||
def test_rms_norm_dispatch(add_residual: bool, use_rocm_aiter: str,
|
||||
use_rocm_aiter_norm: str, monkeypatch):
|
||||
monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
|
||||
monkeypatch.setenv("VLLM_ROCM_USE_AITER_RMSNORM", use_rocm_aiter_norm)
|
||||
rms_norm_func = dispatch_cuda_rmsnorm_func(add_residual)
|
||||
|
||||
if not add_residual:
|
||||
if current_platform.is_rocm() and int(use_rocm_aiter) and int(
|
||||
use_rocm_aiter_norm):
|
||||
assert rms_norm_func == rocm_aiter_rms_norm
|
||||
else:
|
||||
assert rms_norm_func == rms_norm
|
||||
elif current_platform.is_rocm() and int(use_rocm_aiter) and int(
|
||||
use_rocm_aiter_norm):
|
||||
assert rms_norm_func == rocm_aiter_fused_add_rms_norm
|
||||
else:
|
||||
assert rms_norm_func == fused_add_rms_norm
|
||||
|
||||
@ -3,7 +3,11 @@
|
||||
|
||||
Run `pytest tests/models/test_models.py`.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
@ -13,7 +17,21 @@ from ...utils import check_logprobs_close
|
||||
# https://github.com/vllm-project/vllm/issues/14524
|
||||
REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
|
||||
|
||||
# This list contains the model that are using AITER kernel.
|
||||
# Skip model that are not using AITER tests.
|
||||
# When more AITER kernels are added, this list will not be
|
||||
# needed as all the models will be calling AITER kernels
|
||||
# in parts of the operators
|
||||
AITER_MODEL_LIST = [
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
"openbmb/MiniCPM3-4B",
|
||||
"Qwen/Qwen-7B",
|
||||
"Qwen/Qwen2.5-0.5B-Instruct",
|
||||
"ehristoforu/Falcon3-MoE-2x7B-Insruct",
|
||||
]
|
||||
|
||||
|
||||
# @maybe_test_rocm_aiter
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
@ -69,19 +87,24 @@ REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"]
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
@pytest.mark.parametrize(
|
||||
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
|
||||
def test_models(hf_runner, vllm_runner, example_prompts, model: str,
|
||||
dtype: str, max_tokens: int, num_logprobs: int,
|
||||
use_rocm_aiter: bool, monkeypatch) -> None:
|
||||
|
||||
if model in REQUIRES_V0:
|
||||
monkeypatch.setenv("VLLM_USE_V1", "0")
|
||||
|
||||
if use_rocm_aiter and (model in AITER_MODEL_LIST):
|
||||
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
|
||||
elif use_rocm_aiter and model not in AITER_MODEL_LIST:
|
||||
# Skip model that are not using AITER tests.
|
||||
# When more AITER kernels are added, this list will not be
|
||||
# needed as all the models will be calling AITER kernels
|
||||
# in parts of the operators
|
||||
pytest.skip(f"Skipping '{model}' model test with AITER kernel.")
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
if model.startswith("THUDM/chatglm3"):
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
@ -100,3 +123,10 @@ def test_models(
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
if use_rocm_aiter:
|
||||
# this is to ensure that vllm engine
|
||||
# has deallocated the memory before running the next
|
||||
# unit tests. On ROCm, when using AITER
|
||||
# the memory might not be deallocated completely
|
||||
# before running the next test case
|
||||
torch.cuda.synchronize()
|
||||
|
||||
@ -508,6 +508,19 @@ VLM_TEST_SETTINGS = {
|
||||
limit_mm_per_prompt={"image": 4},
|
||||
)],
|
||||
),
|
||||
# regression test for https://github.com/vllm-project/vllm/issues/15122
|
||||
"qwen2_5_vl-windows-attention": VLMTestInfo(
|
||||
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
|
||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForVision2Seq,
|
||||
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
|
||||
custom_test_opts=[CustomTestOptions(
|
||||
inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)],
|
||||
),
|
||||
}
|
||||
# yapf: enable
|
||||
|
||||
|
||||
@ -1,7 +1,11 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Custom input builders for edge-cases in different models."""
|
||||
from io import BytesIO
|
||||
from typing import Callable
|
||||
|
||||
import requests
|
||||
from PIL import Image
|
||||
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.video import (rescale_video_size, resize_video,
|
||||
sample_frames_from_video)
|
||||
@ -102,3 +106,17 @@ def different_patch_input_cases_internvl():
|
||||
build_single_image_inputs(images, formatted_sprompts, wrapped_sf),
|
||||
build_multi_image_inputs([images], formatted_mprompts, wrapped_sf),
|
||||
]
|
||||
|
||||
|
||||
def windows_attention_image_qwen2_5_vl():
|
||||
# image from regression issue: https://github.com/vllm-project/vllm/issues/15122
|
||||
image_url = "https://aomediacodec.github.io/av1-avif/testFiles/Link-U/hato.jpg"
|
||||
image = Image.open(BytesIO(requests.get(image_url).content))
|
||||
|
||||
question = "Describe the image."
|
||||
img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
|
||||
prompt = (f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n"
|
||||
"<|im_start|>assistant\n")
|
||||
|
||||
wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
|
||||
return build_single_image_inputs([image], [prompt], wrapped_sf)
|
||||
|
||||
@ -192,6 +192,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"),
|
||||
"TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B",
|
||||
trust_remote_code=True),
|
||||
"TeleFLMForCausalLM": _HfExamplesInfo("CofeAI/FLM-2-52B-Instruct-2407",
|
||||
trust_remote_code=True),
|
||||
"XverseForCausalLM": _HfExamplesInfo("xverse/XVERSE-7B-Chat",
|
||||
is_available_online=False,
|
||||
trust_remote_code=True),
|
||||
|
||||
@ -6,7 +6,7 @@ from vllm.core.scheduler import Scheduler
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.core.scheduler import Scheduler as V1Scheduler
|
||||
from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler
|
||||
from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
|
||||
|
||||
|
||||
|
||||
@ -41,10 +41,10 @@ async def test_tokenizer_group(tokenizer_group_type):
|
||||
max_input_length=None,
|
||||
)
|
||||
assert reference_tokenizer.encode("prompt") == tokenizer_group.encode(
|
||||
request_id="request_id", prompt="prompt", lora_request=None)
|
||||
prompt="prompt", lora_request=None)
|
||||
assert reference_tokenizer.encode(
|
||||
"prompt") == await tokenizer_group.encode_async(
|
||||
request_id="request_id", prompt="prompt", lora_request=None)
|
||||
"prompt") == await tokenizer_group.encode_async(prompt="prompt",
|
||||
lora_request=None)
|
||||
assert isinstance(tokenizer_group.get_lora_tokenizer(None),
|
||||
PreTrainedTokenizerBase)
|
||||
assert tokenizer_group.get_lora_tokenizer(
|
||||
@ -69,8 +69,7 @@ async def test_tokenizer_group_pool(tokenizer_group_type):
|
||||
# and check that all requests are processed correctly.
|
||||
num_requests = tokenizer_group_pool.pool_size * 5
|
||||
requests = [
|
||||
tokenizer_group_pool.encode_async(request_id=str(i),
|
||||
prompt=f"prompt {i}",
|
||||
tokenizer_group_pool.encode_async(prompt=f"prompt {i}",
|
||||
lora_request=None)
|
||||
for i in range(num_requests)
|
||||
]
|
||||
@ -161,12 +160,8 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type):
|
||||
fail_at[0] = 1000
|
||||
|
||||
# We should recover successfully.
|
||||
await tokenizer_group_pool.encode_async(request_id="1",
|
||||
prompt="prompt",
|
||||
lora_request=None)
|
||||
await tokenizer_group_pool.encode_async(request_id="1",
|
||||
prompt="prompt",
|
||||
lora_request=None)
|
||||
await tokenizer_group_pool.encode_async(prompt="prompt", lora_request=None)
|
||||
await tokenizer_group_pool.encode_async(prompt="prompt", lora_request=None)
|
||||
|
||||
# Check that we have a new actor
|
||||
assert len(tokenizer_group_pool.tokenizer_actors) == len(tokenizer_actors)
|
||||
@ -184,8 +179,7 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type):
|
||||
|
||||
# We should fail after re-initialization.
|
||||
with pytest.raises(RuntimeError):
|
||||
await tokenizer_group_pool.encode_async(request_id="1",
|
||||
prompt="prompt",
|
||||
await tokenizer_group_pool.encode_async(prompt="prompt",
|
||||
lora_request=None)
|
||||
|
||||
# check_health should raise the same thing
|
||||
@ -206,11 +200,8 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type):
|
||||
|
||||
# Prompt too long error
|
||||
with pytest.raises(ValueError):
|
||||
await tokenizer_group_pool.encode_async(request_id="1",
|
||||
prompt="prompt" * 100,
|
||||
await tokenizer_group_pool.encode_async(prompt="prompt" * 100,
|
||||
lora_request=None)
|
||||
await tokenizer_group_pool.encode_async(request_id="1",
|
||||
prompt="prompt",
|
||||
lora_request=None)
|
||||
await tokenizer_group_pool.encode_async(prompt="prompt", lora_request=None)
|
||||
# Actors should stay the same.
|
||||
assert tokenizer_group_pool.tokenizer_actors == tokenizer_actors
|
||||
|
||||
@ -786,7 +786,7 @@ def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator:
|
||||
without enough resources, or called when filtering tests to run directly.
|
||||
"""
|
||||
try:
|
||||
if current_platform.is_cpu() or current_platform.is_openvino():
|
||||
if current_platform.is_cpu():
|
||||
memory_gb = 0
|
||||
else:
|
||||
memory_gb = current_platform.get_device_total_memory() / GB_bytes
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.multimodal.inputs import MultiModalKwargs
|
||||
from vllm.sampling_params import SamplingParams
|
||||
@ -8,7 +9,10 @@ from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
|
||||
KVCacheBlock, PrefixCachingMetrics,
|
||||
generate_block_hash_extra_keys,
|
||||
hash_block_tokens,
|
||||
hash_request_tokens)
|
||||
hash_request_tokens,
|
||||
unify_kv_cache_configs)
|
||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||
KVCacheGroupSpec, KVCacheTensor)
|
||||
from vllm.v1.metrics.stats import PrefixCacheStats
|
||||
from vllm.v1.request import Request
|
||||
|
||||
@ -314,3 +318,107 @@ def test_metrics():
|
||||
assert metrics.aggregated_query_total == 0
|
||||
assert metrics.aggregated_query_hit == 0
|
||||
assert not metrics.query_queue
|
||||
|
||||
|
||||
def test_unify_kv_cache_configs():
|
||||
|
||||
def new_kv_cache_spec(block_size=16,
|
||||
num_kv_heads=2,
|
||||
head_size=64,
|
||||
dtype=torch.float32,
|
||||
use_mla=False):
|
||||
return FullAttentionSpec(block_size=block_size,
|
||||
num_kv_heads=num_kv_heads,
|
||||
head_size=head_size,
|
||||
dtype=dtype,
|
||||
use_mla=use_mla)
|
||||
|
||||
same_kv_cache_config = [
|
||||
KVCacheConfig(
|
||||
num_blocks=10,
|
||||
tensors={
|
||||
"layer1": KVCacheTensor(100),
|
||||
"layer2": KVCacheTensor(100),
|
||||
},
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
|
||||
KVCacheGroupSpec(["layer2"],
|
||||
new_kv_cache_spec(num_kv_heads=4)),
|
||||
],
|
||||
),
|
||||
KVCacheConfig(
|
||||
num_blocks=20,
|
||||
tensors={
|
||||
"layer1": KVCacheTensor(100),
|
||||
"layer2": KVCacheTensor(100),
|
||||
},
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
|
||||
KVCacheGroupSpec(["layer2"],
|
||||
new_kv_cache_spec(num_kv_heads=4)),
|
||||
],
|
||||
),
|
||||
]
|
||||
unify_kv_cache_configs(same_kv_cache_config)
|
||||
assert same_kv_cache_config[0].num_blocks == 10
|
||||
assert same_kv_cache_config[1].num_blocks == 10
|
||||
|
||||
need_sort_kv_cache_config = [
|
||||
KVCacheConfig(
|
||||
num_blocks=10,
|
||||
tensors={
|
||||
"layer1": KVCacheTensor(100),
|
||||
"layer2": KVCacheTensor(100),
|
||||
},
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
|
||||
KVCacheGroupSpec(["layer2"],
|
||||
new_kv_cache_spec(num_kv_heads=4)),
|
||||
],
|
||||
),
|
||||
KVCacheConfig(
|
||||
num_blocks=20,
|
||||
tensors={
|
||||
"layer1": KVCacheTensor(100),
|
||||
"layer2": KVCacheTensor(100),
|
||||
},
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer2"],
|
||||
new_kv_cache_spec(num_kv_heads=4)),
|
||||
KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
unify_kv_cache_configs(need_sort_kv_cache_config)
|
||||
assert need_sort_kv_cache_config[0].num_blocks == 10
|
||||
assert need_sort_kv_cache_config[1].num_blocks == 10
|
||||
|
||||
diff_kv_cache_config = [
|
||||
KVCacheConfig(
|
||||
num_blocks=10,
|
||||
tensors={
|
||||
"layer1": KVCacheTensor(100),
|
||||
"layer2": KVCacheTensor(100),
|
||||
},
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
|
||||
KVCacheGroupSpec(["layer2"],
|
||||
new_kv_cache_spec(num_kv_heads=4)),
|
||||
],
|
||||
),
|
||||
KVCacheConfig(
|
||||
num_blocks=20,
|
||||
tensors={
|
||||
"layer1": KVCacheTensor(100),
|
||||
"layer2": KVCacheTensor(100),
|
||||
},
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(["layer1"], new_kv_cache_spec()),
|
||||
KVCacheGroupSpec(["layer2"],
|
||||
new_kv_cache_spec(num_kv_heads=8)),
|
||||
],
|
||||
),
|
||||
]
|
||||
with pytest.raises(AssertionError):
|
||||
unify_kv_cache_configs(diff_kv_cache_config)
|
||||
|
||||
@ -6,7 +6,8 @@ import pytest
|
||||
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
|
||||
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.core.scheduler import Scheduler, SchedulerOutput
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.core.sched.scheduler import Scheduler
|
||||
from vllm.v1.outputs import ModelRunnerOutput
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
from vllm.v1.structured_output import StructuredOutputManager
|
||||
|
||||
@ -57,6 +57,50 @@ def test_guided_json_completion(
|
||||
jsonschema.validate(instance=output_json, schema=sample_json_schema)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
@pytest.mark.parametrize("guided_decoding_backend",
|
||||
GUIDED_DECODING_BACKENDS_V1)
|
||||
@pytest.mark.parametrize("model_name", MODELS_TO_TEST)
|
||||
def test_guided_json_completion_disable_any_whitespace(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
sample_json_schema: dict[str, Any],
|
||||
guided_decoding_backend: str,
|
||||
model_name: str,
|
||||
):
|
||||
if guided_decoding_backend != "xgrammar":
|
||||
pytest.skip("disable-any-whitespace is only supported for xgrammar.")
|
||||
guided_decoding_backend = 'xgrammar:disable-any-whitespace'
|
||||
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=1024,
|
||||
guided_decoding_backend=guided_decoding_backend)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(json=sample_json_schema))
|
||||
outputs = llm.generate(prompts=[
|
||||
f"Give an example JSON for an employee profile "
|
||||
f"that fits this schema: {sample_json_schema}"
|
||||
] * 2,
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=True)
|
||||
|
||||
assert outputs is not None
|
||||
|
||||
for output in outputs:
|
||||
assert output is not None
|
||||
assert isinstance(output, RequestOutput)
|
||||
prompt = output.prompt
|
||||
|
||||
generated_text = output.outputs[0].text
|
||||
assert generated_text is not None
|
||||
assert "\n" not in generated_text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
output_json = json.loads(generated_text)
|
||||
jsonschema.validate(instance=output_json, schema=sample_json_schema)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
@pytest.mark.parametrize("guided_decoding_backend",
|
||||
GUIDED_DECODING_BACKENDS_V1)
|
||||
@ -301,7 +345,6 @@ def test_guided_choice_completion(
|
||||
prompts="The best language for type-safe systems programming is ",
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=True)
|
||||
|
||||
assert outputs is not None
|
||||
for output in outputs:
|
||||
assert output is not None
|
||||
|
||||
109
tests/v1/tpu/test_mha_attn.py
Normal file
109
tests/v1/tpu/test_mha_attn.py
Normal file
@ -0,0 +1,109 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""
|
||||
Test:
|
||||
|
||||
* Tests for MultiHeadAttention layer
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch_xla
|
||||
import torch_xla.core
|
||||
import torch_xla.core.xla_model
|
||||
|
||||
from vllm import envs
|
||||
from vllm.attention.layer import MultiHeadAttention
|
||||
from vllm.attention.selector import _cached_get_attn_backend
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if not envs.VLLM_USE_V1:
|
||||
pytest.skip(
|
||||
"Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.",
|
||||
allow_module_level=True,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def clear_cache():
|
||||
"""Clear lru cache to ensure each test case runs without caching.
|
||||
"""
|
||||
_cached_get_attn_backend.cache_clear()
|
||||
|
||||
|
||||
def ref_attention(
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
scale: float,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Native implementation of scaled dot product attention without mask:
|
||||
- query, key, value: [batch_size, seq_len, num_heads, head_size]
|
||||
- attn_mask: [batch_size, seq_len, seq_len]
|
||||
"""
|
||||
query, key, value = (x.transpose(1, 2) for x in (query, key, value))
|
||||
attn_weights = scale * torch.matmul(query, key.transpose(2, 3))
|
||||
attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype)
|
||||
out = torch.matmul(attn_weights, value).transpose(1, 2)
|
||||
return out
|
||||
|
||||
|
||||
BATCH_SIZES = [1, 16]
|
||||
SEQ_LENS = [1]
|
||||
NUM_HEADS = [1, 16]
|
||||
NUM_KV_HEADS = [1]
|
||||
HEAD_SIZES = [64, 80]
|
||||
|
||||
|
||||
@pytest.mark.skipif(not current_platform.is_tpu(),
|
||||
reason="This test needs a TPU")
|
||||
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
|
||||
@pytest.mark.parametrize("seq_len", SEQ_LENS)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("num_kv_heads", NUM_KV_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("device", [torch_xla.core.xla_model.xla_device()])
|
||||
def test_mha_attn_forward(
|
||||
batch_size: int,
|
||||
seq_len: int,
|
||||
num_heads: int,
|
||||
num_kv_heads: int,
|
||||
head_size: int,
|
||||
device: str,
|
||||
):
|
||||
current_platform.seed_everything(0)
|
||||
# These are expected to be f32
|
||||
q = torch.randn(batch_size, seq_len, num_heads * head_size, device=device)
|
||||
k = torch.randn(batch_size,
|
||||
seq_len,
|
||||
num_kv_heads * head_size,
|
||||
device=device)
|
||||
v = torch.randn(batch_size,
|
||||
seq_len,
|
||||
num_kv_heads * head_size,
|
||||
device=device)
|
||||
scale = 1.0 / head_size**0.5
|
||||
attn = MultiHeadAttention(num_heads,
|
||||
head_size,
|
||||
scale=scale,
|
||||
num_kv_heads=num_kv_heads)
|
||||
output = attn(q, k, v)
|
||||
|
||||
assert num_heads % num_kv_heads == 0
|
||||
num_queries_per_kv = num_heads // num_kv_heads
|
||||
|
||||
q = q.reshape(batch_size, seq_len, num_heads, head_size)
|
||||
k = k.reshape(batch_size, seq_len, num_kv_heads, head_size)
|
||||
v = v.reshape(batch_size, seq_len, num_kv_heads, head_size)
|
||||
if num_queries_per_kv > 1:
|
||||
k = torch.repeat_interleave(k, num_queries_per_kv, dim=2)
|
||||
v = torch.repeat_interleave(v, num_queries_per_kv, dim=2)
|
||||
|
||||
ref_output = ref_attention(
|
||||
q,
|
||||
k,
|
||||
v,
|
||||
scale=scale,
|
||||
).reshape(batch_size, seq_len, num_heads * head_size)
|
||||
# torch_xla flash_attn kernel is less accurate but much faster
|
||||
torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-3)
|
||||
@ -39,7 +39,7 @@ def test_sampler_compilation(model_name: str, monkeypatch):
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.7,
|
||||
# top_p=0.6, # TODO too slow!
|
||||
# top_k=10,
|
||||
top_k=10,
|
||||
min_p=0.2,
|
||||
max_tokens=16)
|
||||
s = time()
|
||||
@ -49,6 +49,7 @@ def test_sampler_compilation(model_name: str, monkeypatch):
|
||||
# Second request with different params, but for which we
|
||||
# compiled for in previous eager iteration.
|
||||
sampling_params = SamplingParams(temperature=0.1,
|
||||
top_k=12,
|
||||
min_p=0.8,
|
||||
max_tokens=24)
|
||||
s = time()
|
||||
|
||||
@ -3,8 +3,8 @@ import pytest
|
||||
|
||||
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.core.scheduler_output import (CachedRequestData, NewRequestData,
|
||||
SchedulerOutput)
|
||||
from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
|
||||
SchedulerOutput)
|
||||
from vllm.v1.sample.metadata import SamplingMetadata
|
||||
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
|
||||
|
||||
|
||||
@ -630,7 +630,8 @@ class FlashAttentionImpl(AttentionImpl):
|
||||
self.sliding_window = ((sliding_window - 1,
|
||||
0) if sliding_window is not None else (-1, -1))
|
||||
self.kv_cache_dtype = kv_cache_dtype
|
||||
self.vllm_flash_attn_version = get_flash_attn_version()
|
||||
self.vllm_flash_attn_version = get_flash_attn_version(
|
||||
requires_alibi=self.alibi_slopes is not None)
|
||||
if (is_quantized_kv_cache(self.kv_cache_dtype)
|
||||
and self.vllm_flash_attn_version != 3):
|
||||
raise NotImplementedError(
|
||||
|
||||
@ -1,146 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List, Optional, Tuple, Type
|
||||
|
||||
import openvino as ov
|
||||
import torch
|
||||
|
||||
from vllm.attention.backends.abstract import (AttentionBackend,
|
||||
AttentionMetadata)
|
||||
from vllm.attention.backends.utils import CommonAttentionState
|
||||
from vllm.multimodal import MultiModalPlaceholderMap
|
||||
|
||||
|
||||
def copy_cache_block(src_tensor: ov.Tensor, dst_tensor: ov.Tensor,
|
||||
src_offset: int, dst_offset: int) -> None:
|
||||
|
||||
def create_roi_tensor(
|
||||
tensor: ov.Tensor,
|
||||
block_number: int,
|
||||
) -> ov.Tensor:
|
||||
roi_begin = ov.runtime.Coordinate([0, 0, 0, 0])
|
||||
roi_end = ov.runtime.Coordinate(tensor.get_shape())
|
||||
|
||||
roi_begin[0] = block_number
|
||||
roi_end[0] = block_number + 1
|
||||
|
||||
if isinstance(tensor, ov.Tensor):
|
||||
return ov.Tensor(tensor, roi_begin, roi_end)
|
||||
else:
|
||||
return ov.RemoteTensor(tensor, roi_begin, roi_end)
|
||||
|
||||
src_roi_tensor = \
|
||||
create_roi_tensor(src_tensor, src_offset)
|
||||
dst_roi_tensor = \
|
||||
create_roi_tensor(dst_tensor, dst_offset)
|
||||
src_roi_tensor.copy_to(dst_roi_tensor)
|
||||
|
||||
|
||||
class OpenVINOAttentionBackend(AttentionBackend):
|
||||
|
||||
@staticmethod
|
||||
def get_name() -> str:
|
||||
return "OPENVINO"
|
||||
|
||||
@staticmethod
|
||||
def get_impl_cls():
|
||||
# OpenVINO implements PagedAttention as part of the Optimum
|
||||
# exported model
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
def make_metadata(*args, **kwargs) -> "AttentionMetadata":
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
def get_state_cls() -> Type["CommonAttentionState"]:
|
||||
return CommonAttentionState
|
||||
|
||||
@staticmethod
|
||||
def make_openvino_metadata(*args, **kwargs) -> "OpenVINOAttentionMetadata":
|
||||
return OpenVINOAttentionMetadata(*args, **kwargs)
|
||||
|
||||
@staticmethod
|
||||
def get_kv_cache_shape(
|
||||
num_blocks: int,
|
||||
block_size: int,
|
||||
num_kv_heads: int,
|
||||
head_size: int,
|
||||
) -> Tuple[int, ...]:
|
||||
return (2, num_blocks, num_kv_heads, block_size, head_size)
|
||||
|
||||
@staticmethod
|
||||
def swap_blocks(
|
||||
src_tensor: ov.Tensor,
|
||||
dst_tensor: ov.Tensor,
|
||||
src_to_dists: List[Tuple[int, int]],
|
||||
) -> None:
|
||||
for src, dst in src_to_dists:
|
||||
copy_cache_block(src_tensor, dst_tensor, src, dst)
|
||||
|
||||
@staticmethod
|
||||
def copy_blocks(
|
||||
kv_caches: List[Tuple[ov.Tensor, ov.Tensor]],
|
||||
src_to_dists: List[Tuple[int, int]],
|
||||
) -> None:
|
||||
for src, dst in src_to_dists:
|
||||
for key_cache, value_cache in kv_caches:
|
||||
copy_cache_block(key_cache, key_cache, src, dst)
|
||||
copy_cache_block(value_cache, value_cache, src, dst)
|
||||
|
||||
|
||||
@dataclass
|
||||
class OpenVINOAttentionMetadata:
|
||||
"""Metadata for OpenVINOAttentionBackend.
|
||||
|
||||
Basic terms used below:
|
||||
- batch_size_in_sequences - total number of sequences to execute
|
||||
- prompt_lens – per sequence size number of scheduled tokens
|
||||
- batch_size_in_tokens = sum(prompt_lens)
|
||||
- max_context_len = max(context_lens)
|
||||
- max_num_blocks = div_up(max_context_len / BLOCK_SIZE)
|
||||
- num_blocks – total number of blocks in block_indices
|
||||
"""
|
||||
|
||||
# Describes past KV cache size for each sequence within a batch
|
||||
# Shape: [batch_size_in_sequences]
|
||||
# Type: i32
|
||||
past_lens: torch.Tensor
|
||||
|
||||
# Describes start indices of input / speculative tokens from
|
||||
# current sequences within a batch sequence
|
||||
# Shape: [batch_size_in_sequences + 1]
|
||||
# Type: i32
|
||||
subsequence_begins: torch.Tensor
|
||||
|
||||
# Describes block tables for each sequence within a batch -
|
||||
# indices along 0th dimension in key_cache and value_cache inputs
|
||||
# Shape: [num_blocks]
|
||||
# Type: i32
|
||||
block_indices: torch.Tensor
|
||||
|
||||
# Describes block tables for each sequence within a batch -
|
||||
# for i-th element, it is an index in block_indices with the
|
||||
# first block belonging to i-th sequence
|
||||
# Shape: [batch_size_in_sequences + 1]
|
||||
# Type: i32
|
||||
block_indices_begins: torch.Tensor
|
||||
|
||||
# Describes max context length
|
||||
# Shape: scalar
|
||||
# Type: i32
|
||||
max_context_len: torch.Tensor
|
||||
|
||||
# The index maps that relate multi-modal embeddings to the corresponding
|
||||
# placeholders.
|
||||
#
|
||||
# N.B. These aren't really related to attention and don't belong on this
|
||||
# type -- this is just a temporary solution to make them available to
|
||||
# `model_executable`.
|
||||
multi_modal_placeholder_index_maps: Optional[Dict[
|
||||
str, MultiModalPlaceholderMap.IndexMap]]
|
||||
|
||||
# Enable/disable KV scales calculation. This is so that we can disable the
|
||||
# calculation until after prefill and cuda graph capture.
|
||||
enable_kv_scales_calculation: bool
|
||||
@ -281,8 +281,7 @@ class MultiHeadAttention(nn.Module):
|
||||
backend = _Backend.XFORMERS
|
||||
|
||||
self.attn_backend = backend if backend in {
|
||||
_Backend.TORCH_SDPA,
|
||||
_Backend.XFORMERS,
|
||||
_Backend.TORCH_SDPA, _Backend.XFORMERS, _Backend.PALLAS_VLLM_V1
|
||||
} else _Backend.TORCH_SDPA
|
||||
|
||||
def forward(
|
||||
@ -320,6 +319,13 @@ class MultiHeadAttention(nn.Module):
|
||||
value,
|
||||
scale=self.scale)
|
||||
out = out.transpose(1, 2)
|
||||
elif self.attn_backend == _Backend.PALLAS_VLLM_V1:
|
||||
query, key, value = (x.transpose(1, 2)
|
||||
for x in (query, key, value))
|
||||
from torch_xla.experimental.custom_kernel import flash_attention
|
||||
out = flash_attention(query, key, value, sm_scale=self.scale)
|
||||
out = out.transpose(1, 2)
|
||||
|
||||
return out.reshape(bsz, q_len, -1)
|
||||
|
||||
|
||||
|
||||
@ -399,6 +399,7 @@ class VllmBackend:
|
||||
rank = vllm_config.parallel_config.rank
|
||||
dp_rank = vllm_config.parallel_config.data_parallel_rank
|
||||
local_cache_dir = os.path.join(cache_dir, f"rank_{rank}_{dp_rank}")
|
||||
os.makedirs(local_cache_dir, exist_ok=True)
|
||||
self.compilation_config.local_cache_dir = local_cache_dir
|
||||
|
||||
disable_cache = envs.VLLM_DISABLE_COMPILE_CACHE
|
||||
|
||||
@ -1294,6 +1294,12 @@ class LoadConfig:
|
||||
"tensorizer" will use CoreWeave's tensorizer library for
|
||||
fast weight loading.
|
||||
"bitsandbytes" will load nf4 type weights.
|
||||
"sharded_state" will load weights from pre-sharded checkpoint files,
|
||||
supporting efficient loading of tensor-parallel models.
|
||||
"gguf" will load weights from GGUF format files.
|
||||
"mistral" will load weights from consolidated safetensors files used
|
||||
by Mistral models.
|
||||
"runai_streamer" will load weights from RunAI streamer format files.
|
||||
model_loader_extra_config: The extra config for the model loader.
|
||||
ignore_patterns: The list of patterns to ignore when loading the model.
|
||||
Default to "original/**/*" to avoid repeated loading of llama's
|
||||
@ -1795,7 +1801,7 @@ class DeviceConfig:
|
||||
self.device_type = device
|
||||
|
||||
# Some device types require processing inputs on CPU
|
||||
if self.device_type in ["neuron", "openvino"]:
|
||||
if self.device_type in ["neuron"]:
|
||||
self.device = torch.device("cpu")
|
||||
elif self.device_type in ["tpu"]:
|
||||
self.device = None
|
||||
|
||||
@ -26,7 +26,7 @@ from vllm.plugins import load_general_plugins
|
||||
from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
|
||||
from vllm.transformers_utils.utils import check_gguf_file
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils import FlexibleArgumentParser, StoreBoolean
|
||||
from vllm.utils import FlexibleArgumentParser, StoreBoolean, is_in_ray_actor
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
|
||||
@ -40,7 +40,6 @@ DEVICE_OPTIONS = [
|
||||
"cuda",
|
||||
"neuron",
|
||||
"cpu",
|
||||
"openvino",
|
||||
"tpu",
|
||||
"xpu",
|
||||
"hpu",
|
||||
@ -339,9 +338,15 @@ class EngineArgs:
|
||||
'CoreWeave. See the Tensorize vLLM Model script in the Examples '
|
||||
'section for more information.\n'
|
||||
'* "runai_streamer" will load the Safetensors weights using Run:ai'
|
||||
'Model Streamer \n'
|
||||
'Model Streamer.\n'
|
||||
'* "bitsandbytes" will load the weights using bitsandbytes '
|
||||
'quantization.\n')
|
||||
'quantization.\n'
|
||||
'* "sharded_state" will load weights from pre-sharded checkpoint '
|
||||
'files, supporting efficient loading of tensor-parallel models\n'
|
||||
'* "gguf" will load weights from GGUF format files (details '
|
||||
'specified in https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n'
|
||||
'* "mistral" will load weights from consolidated safetensors files '
|
||||
'used by Mistral models.\n')
|
||||
parser.add_argument(
|
||||
'--config-format',
|
||||
default=EngineArgs.config_format,
|
||||
@ -1170,22 +1175,15 @@ class EngineArgs:
|
||||
)
|
||||
|
||||
def create_load_config(self) -> LoadConfig:
|
||||
# bitsandbytes quantization needs a specific model loader
|
||||
# so we make sure the quant method and the load format are consistent
|
||||
if (self.quantization == "bitsandbytes" or
|
||||
self.qlora_adapter_name_or_path is not None) and \
|
||||
self.load_format != "bitsandbytes":
|
||||
raise ValueError(
|
||||
"BitsAndBytes quantization and QLoRA adapter only support "
|
||||
f"'bitsandbytes' load format, but got {self.load_format}")
|
||||
|
||||
if (self.load_format == "bitsandbytes" or
|
||||
self.qlora_adapter_name_or_path is not None) and \
|
||||
if(self.qlora_adapter_name_or_path is not None) and \
|
||||
self.quantization != "bitsandbytes":
|
||||
raise ValueError(
|
||||
"BitsAndBytes load format and QLoRA adapter only support "
|
||||
"QLoRA adapter only support "
|
||||
f"'bitsandbytes' quantization, but got {self.quantization}")
|
||||
|
||||
if self.quantization == "bitsandbytes":
|
||||
self.load_format = "bitsandbytes"
|
||||
return LoadConfig(
|
||||
load_format=self.load_format,
|
||||
download_dir=self.download_dir,
|
||||
@ -1252,6 +1250,18 @@ class EngineArgs:
|
||||
cpu_offload_gb=self.cpu_offload_gb,
|
||||
calculate_kv_scales=self.calculate_kv_scales,
|
||||
)
|
||||
|
||||
# Get the current placement group if Ray is initialized and
|
||||
# we are in a Ray actor. If so, then the placement group will be
|
||||
# passed to spawned processes.
|
||||
placement_group = None
|
||||
if is_in_ray_actor():
|
||||
import ray
|
||||
|
||||
# This call initializes Ray automatically if it is not initialized,
|
||||
# but we should not do this here.
|
||||
placement_group = ray.util.get_current_placement_group()
|
||||
|
||||
parallel_config = ParallelConfig(
|
||||
pipeline_parallel_size=self.pipeline_parallel_size,
|
||||
tensor_parallel_size=self.tensor_parallel_size,
|
||||
@ -1264,6 +1274,7 @@ class EngineArgs:
|
||||
self.tokenizer_pool_extra_config,
|
||||
),
|
||||
ray_workers_use_nsight=self.ray_workers_use_nsight,
|
||||
placement_group=placement_group,
|
||||
distributed_executor_backend=self.distributed_executor_backend,
|
||||
worker_cls=self.worker_cls,
|
||||
worker_extension_cls=self.worker_extension_cls,
|
||||
@ -1474,7 +1485,9 @@ class EngineArgs:
|
||||
return False
|
||||
|
||||
# Only support Xgrammar for guided decoding so far.
|
||||
SUPPORTED_GUIDED_DECODING = ["xgrammar", "xgrammar:nofallback"]
|
||||
SUPPORTED_GUIDED_DECODING = [
|
||||
"xgrammar", "xgrammar:disable-any-whitespace"
|
||||
]
|
||||
if self.guided_decoding_backend not in SUPPORTED_GUIDED_DECODING:
|
||||
_raise_or_fallback(feature_name="--guided-decoding-backend",
|
||||
recommend_to_remove=False)
|
||||
@ -1582,7 +1595,7 @@ class EngineArgs:
|
||||
# No FlashInfer or XFormers so far.
|
||||
V1_BACKENDS = [
|
||||
"FLASH_ATTN_VLLM_V1", "FLASH_ATTN", "PALLAS", "PALLAS_VLLM_V1",
|
||||
"TRITON_MLA", "FLASHMLA"
|
||||
"TRITON_ATTN_VLLM_V1", "TRITON_MLA", "FLASHMLA"
|
||||
]
|
||||
if (envs.is_set("VLLM_ATTENTION_BACKEND")
|
||||
and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS):
|
||||
@ -1695,7 +1708,7 @@ class EngineArgs:
|
||||
# V1 should use the new scheduler by default.
|
||||
# Swap it only if this arg is set to the original V0 default
|
||||
if self.scheduler_cls == EngineArgs.scheduler_cls:
|
||||
self.scheduler_cls = "vllm.v1.core.scheduler.Scheduler"
|
||||
self.scheduler_cls = "vllm.v1.core.sched.scheduler.Scheduler"
|
||||
|
||||
# When no user override, set the default values based on the usage
|
||||
# context.
|
||||
|
||||
@ -492,7 +492,6 @@ class _AsyncLLMEngine(LLMEngine):
|
||||
|
||||
preprocessed_inputs = await self.input_preprocessor.preprocess_async(
|
||||
prompt,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
prompt_adapter_request=prompt_adapter_request,
|
||||
)
|
||||
|
||||
@ -783,7 +783,6 @@ class LLMEngine:
|
||||
|
||||
preprocessed_inputs = self.input_preprocessor.preprocess(
|
||||
prompt,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
prompt_adapter_request=prompt_adapter_request,
|
||||
)
|
||||
|
||||
@ -81,10 +81,7 @@ class EngineClient(ABC):
|
||||
if is_explicit_encoder_decoder_prompt(prompt):
|
||||
raise NotImplementedError
|
||||
else:
|
||||
processed_inputs = preprocessor._prompt_to_llm_inputs(
|
||||
prompt,
|
||||
request_id=request_id,
|
||||
)
|
||||
processed_inputs = preprocessor._prompt_to_llm_inputs(prompt)
|
||||
|
||||
prompt_token_ids = processed_inputs["prompt_token_ids"]
|
||||
prompt_text = processed_inputs.get("prompt")
|
||||
|
||||
52
vllm/envs.py
52
vllm/envs.py
@ -40,11 +40,8 @@ if TYPE_CHECKING:
|
||||
VLLM_CPU_KVCACHE_SPACE: int = 0
|
||||
VLLM_CPU_OMP_THREADS_BIND: str = ""
|
||||
VLLM_CPU_MOE_PREPACK: bool = True
|
||||
VLLM_OPENVINO_DEVICE: str = "CPU"
|
||||
VLLM_OPENVINO_KVCACHE_SPACE: int = 0
|
||||
VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None
|
||||
VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False
|
||||
VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
|
||||
VLLM_XLA_CHECK_RECOMPILATION: bool = False
|
||||
VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024
|
||||
VLLM_USE_RAY_SPMD_WORKER: bool = False
|
||||
VLLM_USE_RAY_COMPILED_DAG: bool = False
|
||||
@ -74,6 +71,8 @@ if TYPE_CHECKING:
|
||||
VLLM_SKIP_P2P_CHECK: bool = False
|
||||
VLLM_DISABLED_KERNELS: list[str] = []
|
||||
VLLM_USE_V1: bool = True
|
||||
VLLM_ROCM_USE_AITER: bool = False
|
||||
VLLM_ROCM_USE_AITER_RMSNORM: bool = True
|
||||
VLLM_ROCM_FP8_PADDING: bool = True
|
||||
VLLM_ENABLE_V1_MULTIPROCESSING: bool = True
|
||||
VLLM_LOG_BATCHSIZE_INTERVAL: float = -1
|
||||
@ -95,6 +94,7 @@ if TYPE_CHECKING:
|
||||
VLLM_DP_MASTER_PORT: int = 0
|
||||
VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
|
||||
VLLM_V0_USE_OUTLINES_CACHE: bool = False
|
||||
VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION: bool = False
|
||||
|
||||
|
||||
def get_default_cache_root():
|
||||
@ -127,7 +127,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
# ================== Installation Time Env Vars ==================
|
||||
|
||||
# Target device of vLLM, supporting [cuda (by default),
|
||||
# rocm, neuron, cpu, openvino]
|
||||
# rocm, neuron, cpu]
|
||||
"VLLM_TARGET_DEVICE":
|
||||
lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),
|
||||
|
||||
@ -354,28 +354,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_CPU_MOE_PREPACK":
|
||||
lambda: bool(int(os.getenv("VLLM_CPU_MOE_PREPACK", "1"))),
|
||||
|
||||
# OpenVINO device selection
|
||||
# default is CPU
|
||||
"VLLM_OPENVINO_DEVICE":
|
||||
lambda: os.getenv("VLLM_OPENVINO_DEVICE", "CPU").upper(),
|
||||
|
||||
# OpenVINO key-value cache space
|
||||
# default is 4GB
|
||||
"VLLM_OPENVINO_KVCACHE_SPACE":
|
||||
lambda: int(os.getenv("VLLM_OPENVINO_KVCACHE_SPACE", "0")),
|
||||
|
||||
# OpenVINO KV cache precision
|
||||
# default is bf16 if natively supported by platform, otherwise f16
|
||||
# To enable KV cache compression, please, explicitly specify u8
|
||||
"VLLM_OPENVINO_CPU_KV_CACHE_PRECISION":
|
||||
lambda: os.getenv("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION", None),
|
||||
|
||||
# Enables weights compression during model export via HF Optimum
|
||||
# default is False
|
||||
"VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS":
|
||||
lambda:
|
||||
(os.environ.get("VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", "0").lower() in
|
||||
("on", "true", "1")),
|
||||
# If the env var is set, then all workers will execute as separate
|
||||
# processes from the engine, and we use the same mechanism to trigger
|
||||
# execution on all workers.
|
||||
@ -445,6 +423,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_XLA_CACHE_PATH",
|
||||
os.path.join(get_default_cache_root(), "vllm", "xla_cache"),
|
||||
)),
|
||||
|
||||
# If set, assert on XLA recompilation after each execution step.
|
||||
"VLLM_XLA_CHECK_RECOMPILATION":
|
||||
lambda: bool(int(os.getenv("VLLM_XLA_CHECK_RECOMPILATION", "0"))),
|
||||
"VLLM_FUSED_MOE_CHUNK_SIZE":
|
||||
lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")),
|
||||
|
||||
@ -522,6 +504,17 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_USE_V1":
|
||||
lambda: bool(int(os.getenv("VLLM_USE_V1", "1"))),
|
||||
|
||||
# Disable aiter ops unless specifically enabled.
|
||||
# Acts as a parent switch to enable the rest of the other operations.
|
||||
"VLLM_ROCM_USE_AITER":
|
||||
lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in
|
||||
("true", "1")),
|
||||
|
||||
# use aiter rms norm op if aiter ops are enabled.
|
||||
"VLLM_ROCM_USE_AITER_RMSNORM":
|
||||
lambda: (os.getenv("VLLM_ROCM_USE_AITER_RMSNORM", "True").lower() in
|
||||
("true", "1")),
|
||||
|
||||
# Pad the fp8 weights to 256 bytes for ROCm
|
||||
"VLLM_ROCM_FP8_PADDING":
|
||||
lambda: bool(int(os.getenv("VLLM_ROCM_FP8_PADDING", "1"))),
|
||||
@ -623,6 +616,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
# an environment with potentially malicious users.
|
||||
"VLLM_V0_USE_OUTLINES_CACHE":
|
||||
lambda: os.environ.get("VLLM_V0_USE_OUTLINES_CACHE", "0") == "1",
|
||||
|
||||
# If set, disables TPU-specific optimization for top-k & top-p sampling
|
||||
"VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION":
|
||||
lambda: bool(int(os.environ["VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION"]))
|
||||
if "VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION" in os.environ else None,
|
||||
}
|
||||
|
||||
# end-env-vars-definition
|
||||
|
||||
@ -16,7 +16,7 @@ import torch
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import _check_multiproc_method, get_mp_context, run_method
|
||||
from vllm.utils import _maybe_force_spawn, get_mp_context, run_method
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -291,7 +291,7 @@ def set_multiprocessing_worker_envs(parallel_config):
|
||||
in a multiprocessing environment. This should be called by the parent
|
||||
process before worker processes are created"""
|
||||
|
||||
_check_multiproc_method()
|
||||
_maybe_force_spawn()
|
||||
|
||||
# Configure thread parallelism if OMP_NUM_THREADS isn't set
|
||||
#
|
||||
|
||||
@ -561,6 +561,15 @@ class RayDistributedExecutor(DistributedExecutorBase):
|
||||
envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL)
|
||||
logger.info("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM = %s",
|
||||
envs.VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM)
|
||||
# Enlarge the default value of "RAY_CGRAPH_get_timeout" to 300 seconds
|
||||
# (it is 10 seconds by default). This is a Ray environment variable to
|
||||
# control the timeout of getting result from a compiled graph execution,
|
||||
# i.e., the distributed execution that includes model forward runs and
|
||||
# intermediate tensor communications, in the case of vllm.
|
||||
os.environ.setdefault("RAY_CGRAPH_get_timeout", "300") # noqa: SIM112
|
||||
logger.info("RAY_CGRAPH_get_timeout is set to %s",
|
||||
os.environ["RAY_CGRAPH_get_timeout"]) # noqa: SIM112
|
||||
|
||||
with InputNode() as input_data:
|
||||
# Example DAG: PP=2, TP=4
|
||||
#
|
||||
|
||||
@ -17,7 +17,7 @@ from vllm.utils import get_ip
|
||||
from vllm.worker.worker_base import WorkerWrapperBase
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.v1.core.scheduler import SchedulerOutput
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.outputs import ModelRunnerOutput
|
||||
|
||||
logger = init_logger(__name__)
|
||||
@ -284,8 +284,9 @@ def initialize_ray_cluster(
|
||||
assert_ray_available()
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# Connect to a ray cluster.
|
||||
if current_platform.is_rocm() or current_platform.is_xpu():
|
||||
if ray.is_initialized():
|
||||
logger.info("Ray is already initialized. Skipping Ray initialization.")
|
||||
elif current_platform.is_rocm() or current_platform.is_xpu():
|
||||
# Try to connect existing ray instance and create a new one if not found
|
||||
try:
|
||||
ray.init("auto", ignore_reinit_error=True)
|
||||
@ -299,19 +300,21 @@ def initialize_ray_cluster(
|
||||
else:
|
||||
ray.init(address=ray_address, ignore_reinit_error=True)
|
||||
|
||||
if parallel_config.placement_group:
|
||||
# Placement group is already set.
|
||||
return
|
||||
|
||||
device_str = current_platform.ray_device_key
|
||||
if not device_str:
|
||||
raise ValueError(
|
||||
f"current platform {current_platform.device_name} does not "
|
||||
"support ray.")
|
||||
|
||||
# Create placement group for worker processes
|
||||
current_placement_group = ray.util.get_current_placement_group()
|
||||
# Create or get the placement group for worker processes
|
||||
if parallel_config.placement_group:
|
||||
current_placement_group = parallel_config.placement_group
|
||||
else:
|
||||
current_placement_group = ray.util.get_current_placement_group()
|
||||
|
||||
if current_placement_group:
|
||||
logger.info("Using the existing placement group")
|
||||
|
||||
# We are in a placement group
|
||||
bundles = current_placement_group.bundle_specs
|
||||
# Verify that we can use the placement group.
|
||||
@ -331,6 +334,8 @@ def initialize_ray_cluster(
|
||||
f"Required number of devices: {parallel_config.world_size}. "
|
||||
f"Total number of devices: {device_bundles}.")
|
||||
else:
|
||||
logger.info("No current placement group found. "
|
||||
"Creating a new placement group.")
|
||||
num_devices_in_cluster = ray.cluster_resources().get(device_str, 0)
|
||||
# Log a warning message and delay resource allocation failure response.
|
||||
# Avoid immediate rejection to allow user-initiated placement group
|
||||
|
||||
@ -7,7 +7,7 @@ from vllm.logger import init_logger
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def get_flash_attn_version() -> Optional[int]:
|
||||
def get_flash_attn_version(requires_alibi: bool = False) -> Optional[int]:
|
||||
# import here to avoid circular dependencies
|
||||
from vllm.platforms import current_platform
|
||||
try:
|
||||
@ -28,8 +28,14 @@ def get_flash_attn_version() -> Optional[int]:
|
||||
|
||||
# 3. fallback for unsupported combinations
|
||||
if device_capability.major == 10 and fa_version == 3:
|
||||
logger.warning("Cannot use FA version 3 on Blackwell platform",
|
||||
"defaulting to FA version 2.")
|
||||
logger.warning_once(
|
||||
"Cannot use FA version 3 on Blackwell platform "
|
||||
"defaulting to FA version 2.")
|
||||
fa_version = 2
|
||||
|
||||
if requires_alibi and fa_version == 3:
|
||||
logger.warning_once("Cannot use FA version 3 with ALiBi, "
|
||||
"defaulting to FA version 2.")
|
||||
fa_version = 2
|
||||
|
||||
if not is_fa_version_supported(fa_version):
|
||||
|
||||
@ -182,7 +182,6 @@ class InputPreprocessor:
|
||||
def _tokenize_prompt(
|
||||
self,
|
||||
prompt: str,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest],
|
||||
) -> list[int]:
|
||||
"""
|
||||
@ -202,15 +201,13 @@ class InputPreprocessor:
|
||||
"do_lower_case", False)):
|
||||
prompt = prompt.lower()
|
||||
|
||||
return tokenizer.encode(request_id=request_id,
|
||||
prompt=prompt,
|
||||
return tokenizer.encode(prompt=prompt,
|
||||
lora_request=lora_request,
|
||||
add_special_tokens=add_special_tokens)
|
||||
|
||||
async def _tokenize_prompt_async(
|
||||
self,
|
||||
prompt: str,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest],
|
||||
) -> list[int]:
|
||||
"""Async version of :meth:`_tokenize_prompt`."""
|
||||
@ -222,7 +219,6 @@ class InputPreprocessor:
|
||||
# appending an EOS token to the prompt which disrupts generation.
|
||||
add_special_tokens = False
|
||||
return await tokenizer.encode_async(
|
||||
request_id=request_id,
|
||||
prompt=prompt,
|
||||
lora_request=lora_request,
|
||||
add_special_tokens=add_special_tokens)
|
||||
@ -309,7 +305,6 @@ class InputPreprocessor:
|
||||
def _prompt_to_llm_inputs(
|
||||
self,
|
||||
prompt: SingletonPrompt,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
) -> SingletonInputs:
|
||||
@ -318,7 +313,6 @@ class InputPreprocessor:
|
||||
|
||||
Arguments:
|
||||
|
||||
* request_id
|
||||
* prompt: single encoder or decoder input prompt
|
||||
* lora_request: this is only valid for decoder prompts
|
||||
* return_mm_hashes: whether to return multimodal hashes
|
||||
@ -333,7 +327,6 @@ class InputPreprocessor:
|
||||
prompt_text = parsed["content"]
|
||||
prompt_token_ids = self._tokenize_prompt(
|
||||
prompt_text,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
|
||||
@ -384,7 +377,6 @@ class InputPreprocessor:
|
||||
|
||||
prompt_token_ids = self._tokenize_prompt(
|
||||
prompt_text,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
|
||||
@ -400,7 +392,6 @@ class InputPreprocessor:
|
||||
async def _prompt_to_llm_inputs_async(
|
||||
self,
|
||||
prompt: SingletonPrompt,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
) -> SingletonInputs:
|
||||
@ -411,7 +402,6 @@ class InputPreprocessor:
|
||||
prompt_text = parsed["content"]
|
||||
prompt_token_ids = await self._tokenize_prompt_async(
|
||||
prompt_text,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
|
||||
@ -460,7 +450,6 @@ class InputPreprocessor:
|
||||
|
||||
prompt_token_ids = await self._tokenize_prompt_async(
|
||||
prompt_text,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
|
||||
@ -560,7 +549,6 @@ class InputPreprocessor:
|
||||
def _process_encoder_decoder_prompt(
|
||||
self,
|
||||
prompt: PromptType,
|
||||
request_id: str,
|
||||
) -> EncoderDecoderInputs:
|
||||
"""
|
||||
For encoder/decoder models only:
|
||||
@ -587,7 +575,6 @@ class InputPreprocessor:
|
||||
Arguments:
|
||||
|
||||
* prompt: an input prompt
|
||||
* request_id
|
||||
|
||||
Returns:
|
||||
|
||||
@ -598,16 +585,11 @@ class InputPreprocessor:
|
||||
|
||||
if is_explicit_encoder_decoder_prompt(prompt):
|
||||
encoder_inputs = self._prompt_to_llm_inputs(
|
||||
prompt["encoder_prompt"],
|
||||
request_id=request_id,
|
||||
)
|
||||
prompt["encoder_prompt"])
|
||||
if (decoder_input := prompt["decoder_prompt"]) is None:
|
||||
decoder_inputs = None
|
||||
else:
|
||||
decoder_inputs = self._prompt_to_llm_inputs(
|
||||
decoder_input,
|
||||
request_id=request_id,
|
||||
)
|
||||
decoder_inputs = self._prompt_to_llm_inputs(decoder_input)
|
||||
# For multimodal model, override decoder prompt from processor
|
||||
# with explicit decoder prompt.
|
||||
if self.model_config.is_multimodal_model and (
|
||||
@ -616,10 +598,7 @@ class InputPreprocessor:
|
||||
self._separate_enc_dec_inputs_from_mm_processor_outputs(
|
||||
encoder_inputs, decoder_inputs))
|
||||
else:
|
||||
inputs = self._prompt_to_llm_inputs(
|
||||
prompt,
|
||||
request_id=request_id,
|
||||
)
|
||||
inputs = self._prompt_to_llm_inputs(prompt)
|
||||
if self.model_config.is_multimodal_model and (
|
||||
self._can_process_multimodal()):
|
||||
# Encoder-Decoder Multimodal model
|
||||
@ -636,7 +615,6 @@ class InputPreprocessor:
|
||||
async def _process_encoder_decoder_prompt_async(
|
||||
self,
|
||||
prompt: PromptType,
|
||||
request_id: str,
|
||||
) -> EncoderDecoderInputs:
|
||||
"""Async version of :meth:`_process_encoder_decoder_prompt`."""
|
||||
encoder_inputs: SingletonInputs
|
||||
@ -644,18 +622,13 @@ class InputPreprocessor:
|
||||
|
||||
if is_explicit_encoder_decoder_prompt(prompt):
|
||||
encoder_task = self._prompt_to_llm_inputs_async(
|
||||
prompt["encoder_prompt"],
|
||||
request_id=request_id,
|
||||
)
|
||||
prompt["encoder_prompt"])
|
||||
|
||||
if (decoder_input := prompt["decoder_prompt"]) is None:
|
||||
encoder_inputs = await encoder_task
|
||||
decoder_inputs = None
|
||||
else:
|
||||
decoder_task = self._prompt_to_llm_inputs_async(
|
||||
decoder_input,
|
||||
request_id=request_id,
|
||||
)
|
||||
decoder_task = self._prompt_to_llm_inputs_async(decoder_input)
|
||||
|
||||
encoder_inputs, decoder_inputs = await asyncio.gather(
|
||||
encoder_task, decoder_task)
|
||||
@ -668,10 +641,7 @@ class InputPreprocessor:
|
||||
self._separate_enc_dec_inputs_from_mm_processor_outputs(
|
||||
encoder_inputs, decoder_inputs))
|
||||
else:
|
||||
inputs = await self._prompt_to_llm_inputs_async(
|
||||
prompt,
|
||||
request_id=request_id,
|
||||
)
|
||||
inputs = await self._prompt_to_llm_inputs_async(prompt)
|
||||
if self.model_config.is_multimodal_model and (
|
||||
self._can_process_multimodal()):
|
||||
# Encoder-Decoder Multimodal model
|
||||
@ -704,7 +674,6 @@ class InputPreprocessor:
|
||||
def _process_decoder_only_prompt(
|
||||
self,
|
||||
prompt: SingletonPrompt,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
@ -716,7 +685,6 @@ class InputPreprocessor:
|
||||
Arguments:
|
||||
|
||||
* prompt: input prompt
|
||||
* request_id
|
||||
* lora_request
|
||||
* prompt_adapter_request
|
||||
* return_mm_hashes
|
||||
@ -728,7 +696,6 @@ class InputPreprocessor:
|
||||
|
||||
prompt_comps = self._prompt_to_llm_inputs(
|
||||
prompt,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
@ -741,7 +708,6 @@ class InputPreprocessor:
|
||||
async def _process_decoder_only_prompt_async(
|
||||
self,
|
||||
prompt: SingletonPrompt,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
@ -749,7 +715,6 @@ class InputPreprocessor:
|
||||
"""Async version of :meth:`_process_decoder_only_prompt`."""
|
||||
prompt_comps = await self._prompt_to_llm_inputs_async(
|
||||
prompt,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
@ -762,7 +727,6 @@ class InputPreprocessor:
|
||||
def preprocess(
|
||||
self,
|
||||
prompt: PromptType,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
@ -774,10 +738,7 @@ class InputPreprocessor:
|
||||
"returned until they are supported on vLLM V1.")
|
||||
# Encoder-decoder model requires special mapping of
|
||||
# input prompts to encoder & decoder
|
||||
return self._process_encoder_decoder_prompt(
|
||||
prompt,
|
||||
request_id=request_id,
|
||||
)
|
||||
return self._process_encoder_decoder_prompt(prompt)
|
||||
|
||||
if is_explicit_encoder_decoder_prompt(prompt):
|
||||
raise ValueError("Cannot pass encoder-decoder prompt "
|
||||
@ -786,7 +747,6 @@ class InputPreprocessor:
|
||||
# Decoder-only operation
|
||||
return self._process_decoder_only_prompt(
|
||||
prompt,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
prompt_adapter_request=prompt_adapter_request,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
@ -795,7 +755,6 @@ class InputPreprocessor:
|
||||
async def preprocess_async(
|
||||
self,
|
||||
prompt: PromptType,
|
||||
request_id: str,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
@ -807,10 +766,7 @@ class InputPreprocessor:
|
||||
"returned until they are supported on vLLM V1.")
|
||||
# Encoder-decoder model requires special mapping of
|
||||
# input prompts to encoder & decoder
|
||||
return await self._process_encoder_decoder_prompt_async(
|
||||
prompt,
|
||||
request_id=request_id,
|
||||
)
|
||||
return await self._process_encoder_decoder_prompt_async(prompt)
|
||||
|
||||
if is_explicit_encoder_decoder_prompt(prompt):
|
||||
raise ValueError("Cannot pass encoder-decoder prompt "
|
||||
@ -819,7 +775,6 @@ class InputPreprocessor:
|
||||
# Decoder-only operation
|
||||
return await self._process_decoder_only_prompt_async(
|
||||
prompt,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
prompt_adapter_request=prompt_adapter_request,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
|
||||
@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, List, Optional, Tuple, Union, final
|
||||
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.lora.layers import LoRAMapping
|
||||
from vllm.triton_utils import HAS_TRITON
|
||||
|
||||
@ -42,8 +43,15 @@ class PunicaWrapperGPU(PunicaWrapperBase):
|
||||
self.token_mapping_meta = LoRAKernelMeta.make(self.max_loras,
|
||||
max_num_batched_tokens,
|
||||
device=device)
|
||||
|
||||
# When cudagraph capture size is greater than max_num_seqs (max_batches,
|
||||
# here), V0 captures the graph as if max_num_seqs is set to
|
||||
# the capture size.
|
||||
# V1 doesn't have this problem and always respects max_num_seqs.
|
||||
max_num_prompts = (max_batches
|
||||
if envs.VLLM_USE_V1 else max_num_batched_tokens)
|
||||
self.prompt_mapping_meta = LoRAKernelMeta.make(self.max_loras,
|
||||
max_batches,
|
||||
max_num_prompts,
|
||||
device=device)
|
||||
|
||||
def update_metadata(
|
||||
|
||||
@ -5,7 +5,77 @@ from typing import Optional, Tuple, Union
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.model_executor.custom_op import CustomOp
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def is_rocm_aiter_rmsnorm_enabled() -> bool:
|
||||
return current_platform.is_rocm() \
|
||||
and envs.VLLM_ROCM_USE_AITER_RMSNORM \
|
||||
and envs.VLLM_ROCM_USE_AITER
|
||||
|
||||
|
||||
def rms_norm(x: torch.Tensor, weight: torch.Tensor,
|
||||
variance_epsilon: float) -> torch.Tensor:
|
||||
from vllm import _custom_ops as ops
|
||||
out = torch.empty_like(x)
|
||||
ops.rms_norm(
|
||||
out,
|
||||
x,
|
||||
weight,
|
||||
variance_epsilon,
|
||||
)
|
||||
return out
|
||||
|
||||
|
||||
def fused_add_rms_norm(
|
||||
x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor,
|
||||
variance_epsilon: float) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
from vllm import _custom_ops as ops
|
||||
ops.fused_add_rms_norm(
|
||||
x,
|
||||
residual,
|
||||
weight,
|
||||
variance_epsilon,
|
||||
)
|
||||
return x, residual
|
||||
|
||||
|
||||
def rocm_aiter_rms_norm(x: torch.Tensor, weight: torch.Tensor,
|
||||
variance_epsilon: float) -> torch.Tensor:
|
||||
|
||||
import aiter as rocm_aiter
|
||||
return rocm_aiter.rms_norm(x, weight, variance_epsilon)
|
||||
|
||||
|
||||
def rocm_aiter_fused_add_rms_norm(
|
||||
x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor,
|
||||
variance_epsilon: float) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
|
||||
import aiter as rocm_aiter
|
||||
|
||||
# Assuming the correct signature for rmsnorm2d_fwd_with_add
|
||||
rocm_aiter.rmsnorm2d_fwd_with_add(
|
||||
x, # output
|
||||
x, # input
|
||||
residual, # residual input
|
||||
residual, # residual output
|
||||
weight,
|
||||
variance_epsilon,
|
||||
)
|
||||
return x, residual
|
||||
|
||||
|
||||
def dispatch_cuda_rmsnorm_func(add_residual: bool):
|
||||
if add_residual:
|
||||
if is_rocm_aiter_rmsnorm_enabled():
|
||||
return rocm_aiter_fused_add_rms_norm
|
||||
return fused_add_rms_norm
|
||||
|
||||
if is_rocm_aiter_rmsnorm_enabled():
|
||||
return rocm_aiter_rms_norm
|
||||
return rms_norm
|
||||
|
||||
|
||||
@CustomOp.register("rms_norm")
|
||||
@ -81,24 +151,14 @@ class RMSNorm(CustomOp):
|
||||
if self.variance_size_override is not None:
|
||||
return self.forward_native(x, residual)
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
add_residual = residual is not None
|
||||
norm_func = dispatch_cuda_rmsnorm_func(add_residual)
|
||||
|
||||
if residual is not None:
|
||||
ops.fused_add_rms_norm(
|
||||
x,
|
||||
residual,
|
||||
self.weight.data,
|
||||
self.variance_epsilon,
|
||||
)
|
||||
return x, residual
|
||||
out = torch.empty_like(x)
|
||||
ops.rms_norm(
|
||||
out,
|
||||
x,
|
||||
self.weight.data,
|
||||
self.variance_epsilon,
|
||||
)
|
||||
return out
|
||||
if add_residual:
|
||||
return norm_func(x, residual, self.weight.data,
|
||||
self.variance_epsilon)
|
||||
else:
|
||||
return norm_func(x, self.weight.data, self.variance_epsilon)
|
||||
|
||||
def forward_hpu(
|
||||
self,
|
||||
|
||||
@ -251,6 +251,9 @@ class MambaMixer2(CustomOp):
|
||||
"then num_groups must equal 1."
|
||||
)
|
||||
|
||||
assert self.tp_size == 1 or quant_config is None, \
|
||||
"Tensor parallel currently not supported for quantized models."
|
||||
|
||||
self.ssm_state_size = ssm_state_size
|
||||
self.activation = activation
|
||||
|
||||
@ -331,22 +334,24 @@ class MambaMixer2(CustomOp):
|
||||
], self.tp_size, tp_rank)
|
||||
})
|
||||
|
||||
delattr(self.in_proj.weight, "weight_loader")
|
||||
set_weight_attrs(
|
||||
self.in_proj.weight,
|
||||
{
|
||||
"weight_loader":
|
||||
mamba_v2_sharded_weight_loader(
|
||||
[
|
||||
intermediate_settings, # for gate
|
||||
intermediate_settings,
|
||||
group_shard_settings,
|
||||
group_shard_settings,
|
||||
head_setings, # for dt
|
||||
],
|
||||
self.tp_size,
|
||||
tp_rank)
|
||||
})
|
||||
if quant_config is None:
|
||||
# - quant layers do not have a weight loader
|
||||
delattr(self.in_proj.weight, "weight_loader")
|
||||
set_weight_attrs(
|
||||
self.in_proj.weight,
|
||||
{
|
||||
"weight_loader":
|
||||
mamba_v2_sharded_weight_loader(
|
||||
[
|
||||
intermediate_settings, # for gate
|
||||
intermediate_settings,
|
||||
group_shard_settings,
|
||||
group_shard_settings,
|
||||
head_setings, # for dt
|
||||
],
|
||||
self.tp_size,
|
||||
tp_rank)
|
||||
})
|
||||
|
||||
# - these are TPed by heads to reduce the size of the
|
||||
# temporal shape
|
||||
@ -465,10 +470,11 @@ class MambaMixer2(CustomOp):
|
||||
if has_prefill:
|
||||
|
||||
initial_states = None
|
||||
if has_initial_states is not None and any(has_initial_states):
|
||||
for idx in mamba_cache_params.state_indices_tensor[
|
||||
~has_initial_states]:
|
||||
mamba_cache_params.ssm_state[idx].zero_()
|
||||
if has_initial_states is not None and torch.any(
|
||||
has_initial_states):
|
||||
zero_init_indices = mamba_cache_params.state_indices_tensor[
|
||||
~has_initial_states]
|
||||
mamba_cache_params.ssm_state[zero_init_indices] = 0
|
||||
initial_states = mamba_cache_params.ssm_state[
|
||||
mamba_cache_params.state_indices_tensor]
|
||||
|
||||
@ -494,8 +500,8 @@ class MambaMixer2(CustomOp):
|
||||
|
||||
# update ssm states
|
||||
# - varlen state is a (batch, nheads, headdim, dstate) tensor
|
||||
for i, idx in enumerate(mamba_cache_params.state_indices_tensor):
|
||||
mamba_cache_params.ssm_state[idx].copy_(varlen_state[i])
|
||||
mamba_cache_params.ssm_state[
|
||||
mamba_cache_params.state_indices_tensor] = varlen_state
|
||||
|
||||
# - reshape
|
||||
hidden_states = scan_output.view(seq_len, -1)
|
||||
|
||||
@ -1,204 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
# ruff: noqa: SIM117
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import openvino as ov
|
||||
import torch
|
||||
from huggingface_hub import HfApi
|
||||
from openvino._offline_transformations import paged_attention_transformation
|
||||
from optimum.intel import OVModelForCausalLM
|
||||
from torch import nn
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.forward_context import get_forward_context
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.logits_processor import (LogitsProcessor,
|
||||
_prune_hidden_states)
|
||||
from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def _flatten_inputs(inputs):
|
||||
"""
|
||||
Helper function for making nested inputs flattens
|
||||
"""
|
||||
flatten_inputs = []
|
||||
for input_data in inputs:
|
||||
if input_data is None:
|
||||
continue
|
||||
if isinstance(input_data, (list, tuple)):
|
||||
flatten_inputs.extend(_flatten_inputs(input_data))
|
||||
elif isinstance(input_data, dict):
|
||||
flatten_inputs.extend(_flatten_inputs(list(input_data.values())))
|
||||
else:
|
||||
flatten_inputs.append(input_data)
|
||||
return flatten_inputs
|
||||
|
||||
|
||||
def _modify_cache_parameters(model: ov.Model, kv_cache_dtype: ov.Type,
|
||||
is_cpu: bool):
|
||||
# Apply hardware dependent modifications to KV tensors
|
||||
for parameter in model.get_parameters():
|
||||
input = parameter.get_output_tensor(0)
|
||||
input_names = input.get_names()
|
||||
if len(input_names) != 1:
|
||||
continue
|
||||
input_name = next(iter(input_names))
|
||||
shape = parameter.get_partial_shape()
|
||||
# use real block size if available, just a placeholder
|
||||
# to provide the expected rank
|
||||
num_blocks = ov.Dimension()
|
||||
block_size = ov.Dimension()
|
||||
head_size = ov.Dimension()
|
||||
if input_name.startswith("key_cache."):
|
||||
cpu_shape = [num_blocks, shape[1], block_size, head_size]
|
||||
gpu_shape = [num_blocks, shape[1], shape[2], block_size]
|
||||
elif input_name.startswith("value_cache."):
|
||||
cpu_shape = [num_blocks, shape[1], block_size, head_size]
|
||||
gpu_shape = [num_blocks, shape[1], block_size, shape[2]]
|
||||
else:
|
||||
continue
|
||||
parameter.set_partial_shape(
|
||||
ov.PartialShape(cpu_shape if is_cpu else gpu_shape))
|
||||
parameter.set_element_type(kv_cache_dtype)
|
||||
model.validate_nodes_and_infer_types()
|
||||
|
||||
|
||||
def _require_model_export(model_id, revision=None, subfolder=None):
|
||||
model_dir = Path(model_id)
|
||||
if subfolder is not None:
|
||||
model_dir = model_dir / subfolder
|
||||
if model_dir.is_dir():
|
||||
return (not (model_dir / "openvino_model.xml").exists()
|
||||
or not (model_dir / "openvino_model.bin").exists())
|
||||
|
||||
hf_api = HfApi()
|
||||
try:
|
||||
model_info = hf_api.model_info(model_id, revision=revision or "main")
|
||||
normalized_subfolder = (None if subfolder is None else
|
||||
Path(subfolder).as_posix())
|
||||
model_files = [
|
||||
file.rfilename for file in model_info.siblings
|
||||
if normalized_subfolder is None
|
||||
or file.rfilename.startswith(normalized_subfolder)
|
||||
]
|
||||
ov_model_path = ("openvino_model.xml" if normalized_subfolder is None
|
||||
else f"{normalized_subfolder}/openvino_model.xml")
|
||||
return (ov_model_path not in model_files
|
||||
or ov_model_path.replace(".xml", ".bin") not in model_files)
|
||||
except Exception:
|
||||
return True
|
||||
|
||||
|
||||
class OpenVINOCausalLM(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
ov_core: ov.Core,
|
||||
model_config: ModelConfig,
|
||||
kv_cache_dtype: ov.Type,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.logits_processor = LogitsProcessor(
|
||||
model_config.hf_config.vocab_size, logits_as_input=True)
|
||||
self.sampler = Sampler()
|
||||
|
||||
export = _require_model_export(model_config.model)
|
||||
if export:
|
||||
logger.warning(
|
||||
f"Provided model id {model_config.model} does not " # noqa: G004
|
||||
"contain OpenVINO IR, the model will be converted to IR with "
|
||||
"default options. If you need to use specific options for "
|
||||
"model conversion, use optimum-cli export openvino with "
|
||||
"desired options.")
|
||||
else:
|
||||
logger.warning(
|
||||
"OpenVINO IR is available for provided model id " # noqa: G004
|
||||
f"{model_config.model}. This IR will be used for inference "
|
||||
"as-is, all possible options that may affect model conversion "
|
||||
"are ignored.")
|
||||
|
||||
load_in_8bit = (envs.VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS
|
||||
if export else False)
|
||||
pt_model = OVModelForCausalLM.from_pretrained(
|
||||
model_config.model,
|
||||
export=export,
|
||||
compile=False,
|
||||
load_in_8bit=load_in_8bit,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
)
|
||||
|
||||
ov_device = envs.VLLM_OPENVINO_DEVICE
|
||||
paged_attention_transformation(pt_model.model)
|
||||
_modify_cache_parameters(pt_model.model, kv_cache_dtype,
|
||||
current_platform.is_openvino_cpu())
|
||||
|
||||
ov_compiled = ov_core.compile_model(pt_model.model, ov_device)
|
||||
self.ov_request = ov_compiled.create_infer_request()
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
kv_caches: list[tuple[ov.Tensor, ov.Tensor]],
|
||||
) -> torch.Tensor:
|
||||
flat_kv_caches = _flatten_inputs(kv_caches)
|
||||
attn_metadata = get_forward_context().attn_metadata
|
||||
|
||||
inputs = [
|
||||
input_ids,
|
||||
positions,
|
||||
*flat_kv_caches,
|
||||
attn_metadata.past_lens,
|
||||
attn_metadata.subsequence_begins,
|
||||
attn_metadata.block_indices,
|
||||
attn_metadata.block_indices_begins,
|
||||
attn_metadata.max_context_len,
|
||||
]
|
||||
|
||||
self.ov_request.start_async(inputs, share_inputs=True)
|
||||
self.ov_request.wait()
|
||||
|
||||
logits = torch.from_numpy(self.ov_request.get_tensor("logits").data)
|
||||
|
||||
# TODO: remove 'view' once OpenVINO PA will drop 'seq_len' dimension
|
||||
return logits.view(-1, logits.shape[-1])
|
||||
|
||||
def compute_logits(self, hidden_states: torch.Tensor,
|
||||
sampling_metadata: SamplingMetadata) -> torch.Tensor:
|
||||
hidden_states = _prune_hidden_states(hidden_states, sampling_metadata)
|
||||
logits = self.logits_processor(None, hidden_states, sampling_metadata)
|
||||
return logits
|
||||
|
||||
def sample(
|
||||
self,
|
||||
logits: torch.Tensor,
|
||||
sampling_metadata: SamplingMetadata,
|
||||
) -> Optional[SamplerOutput]:
|
||||
next_tokens = self.sampler(logits, sampling_metadata)
|
||||
return next_tokens
|
||||
|
||||
|
||||
def get_model(
|
||||
vllm_config: VllmConfig,
|
||||
kv_cache_dtype: ov.Type,
|
||||
**kwargs,
|
||||
) -> torch.nn.Module:
|
||||
lora_config = kwargs.get("lora_config")
|
||||
ov_core = kwargs.get("ov_core")
|
||||
if lora_config:
|
||||
raise ValueError(
|
||||
"OpenVINO modeling does not support LoRA, "
|
||||
"but LoRA is enabled. Support for this model may "
|
||||
"be added in the future. If this is important to you, "
|
||||
"please open an issue on github.")
|
||||
|
||||
with set_current_vllm_config(vllm_config):
|
||||
return OpenVINOCausalLM(ov_core, vllm_config.model_config,
|
||||
kv_cache_dtype)
|
||||
@ -233,7 +233,13 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
|
||||
class LlavaProcessingInfo(BaseLlavaProcessingInfo):
|
||||
|
||||
def get_hf_processor(self, **kwargs: object):
|
||||
return self.ctx.get_hf_processor(LlavaProcessor, **kwargs)
|
||||
hf_processor = self.ctx.get_hf_processor(LlavaProcessor, **kwargs)
|
||||
# In case patch_size is omitted from `processor_config.json`
|
||||
# e.g. for E5-V: https://huggingface.co/royokong/e5-v
|
||||
if hf_processor.patch_size is None:
|
||||
patch_size = self.get_vision_encoder_info().get_patch_size()
|
||||
hf_processor.patch_size = patch_size
|
||||
return hf_processor
|
||||
|
||||
|
||||
class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||
|
||||
@ -608,6 +608,17 @@ class Qwen2_5_VisionTransformer(nn.Module):
|
||||
window_index = torch.cat(window_index, dim=0)
|
||||
return window_index, cu_window_seqlens
|
||||
|
||||
def compute_attn_mask_seqlen(
|
||||
self,
|
||||
cu_seqlens: torch.Tensor,
|
||||
) -> tuple[Optional[int], Optional[list[int]]]:
|
||||
max_seqlen, seqlens = None, None
|
||||
if self.attn_backend == _Backend.FLASH_ATTN:
|
||||
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
|
||||
elif self.attn_backend == _Backend.XFORMERS:
|
||||
seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
|
||||
return max_seqlen, seqlens
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
@ -645,23 +656,27 @@ class Qwen2_5_VisionTransformer(nn.Module):
|
||||
# transformers
|
||||
hidden_states = hidden_states.unsqueeze(1)
|
||||
|
||||
max_seqlen = None
|
||||
seqlens = None
|
||||
if self.attn_backend == _Backend.FLASH_ATTN:
|
||||
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
|
||||
elif self.attn_backend == _Backend.XFORMERS:
|
||||
seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
|
||||
# pre-compute seqlens for window/full attn to reduce cuMemcpy operations
|
||||
max_seqlen_full, seqlens_full = self.compute_attn_mask_seqlen(
|
||||
cu_seqlens)
|
||||
max_seqlen_window, seqlens_window = self.compute_attn_mask_seqlen(
|
||||
cu_window_seqlens)
|
||||
for layer_num, blk in enumerate(self.blocks):
|
||||
if layer_num in self.fullatt_block_indexes:
|
||||
cu_seqlens_now = cu_seqlens
|
||||
max_seqlen_now = max_seqlen_full
|
||||
seqlens_now = seqlens_full
|
||||
else:
|
||||
cu_seqlens_now = cu_window_seqlens
|
||||
max_seqlen_now = max_seqlen_window
|
||||
seqlens_now = seqlens_window
|
||||
|
||||
hidden_states = blk(
|
||||
hidden_states,
|
||||
cu_seqlens=cu_seqlens_now,
|
||||
rotary_pos_emb=rotary_pos_emb,
|
||||
max_seqlen=max_seqlen,
|
||||
seqlens=seqlens,
|
||||
max_seqlen=max_seqlen_now,
|
||||
seqlens=seqlens_now,
|
||||
)
|
||||
|
||||
# For Qwen2.5-VL-3B, float16 will overflow at last block
|
||||
|
||||
@ -617,6 +617,16 @@ class Qwen2VisionTransformer(nn.Module):
|
||||
rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
|
||||
return rotary_pos_emb
|
||||
|
||||
def compute_attn_mask_seqlen(
|
||||
self, cu_seqlens: torch.Tensor
|
||||
) -> tuple[Optional[int], Optional[list[int]]]:
|
||||
max_seqlen, seqlens = None, None
|
||||
if self.attn_backend == _Backend.FLASH_ATTN:
|
||||
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
|
||||
elif self.attn_backend == _Backend.XFORMERS:
|
||||
seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
|
||||
return max_seqlen, seqlens
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
@ -638,12 +648,8 @@ class Qwen2VisionTransformer(nn.Module):
|
||||
# transformers
|
||||
x = x.unsqueeze(1)
|
||||
|
||||
max_seqlen = None
|
||||
seqlens = None
|
||||
if self.attn_backend == _Backend.FLASH_ATTN:
|
||||
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
|
||||
elif self.attn_backend == _Backend.XFORMERS:
|
||||
seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
|
||||
# pre-compute seqlens for attn mask to reduce cuMemcpy operations
|
||||
max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens)
|
||||
for blk in self.blocks:
|
||||
x = blk(
|
||||
x,
|
||||
|
||||
@ -104,6 +104,7 @@ _TEXT_GENERATION_MODELS = {
|
||||
"Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"),
|
||||
"SolarForCausalLM": ("solar", "SolarForCausalLM"),
|
||||
"TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"),
|
||||
"TeleFLMForCausalLM": ("teleflm", "TeleFLMForCausalLM"),
|
||||
"XverseForCausalLM": ("llama", "LlamaForCausalLM"),
|
||||
"Zamba2ForCausalLM": ("zamba2", "Zamba2ForCausalLM"),
|
||||
# [Encoder-decoder]
|
||||
@ -418,11 +419,13 @@ class _ModelRegistry:
|
||||
if not architectures:
|
||||
logger.warning("No model architectures are specified")
|
||||
|
||||
normalized_arch = []
|
||||
for model in architectures:
|
||||
if model not in self.models:
|
||||
model = "TransformersModel"
|
||||
normalized_arch.append(model)
|
||||
# filter out support architectures
|
||||
normalized_arch = list(
|
||||
filter(lambda model: model in self.models, architectures))
|
||||
|
||||
# make sure Transformers fallback are put at the last
|
||||
if len(normalized_arch) != len(architectures):
|
||||
normalized_arch.append("TransformersModel")
|
||||
return normalized_arch
|
||||
|
||||
def inspect_model_cls(
|
||||
|
||||
79
vllm/model_executor/models/teleflm.py
Normal file
79
vllm/model_executor/models/teleflm.py
Normal file
@ -0,0 +1,79 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
# Adapted from
|
||||
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
||||
# and OPT implementations in this library. It has been modified from its
|
||||
# original forms to accommodate minor architectural differences compared
|
||||
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Type
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.models.llama import (LlamaDecoderLayer,
|
||||
LlamaForCausalLM, LlamaModel)
|
||||
|
||||
|
||||
class TeleFLMModel(LlamaModel):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
vllm_config: VllmConfig,
|
||||
prefix: str = "",
|
||||
layer_type: Type[LlamaDecoderLayer] = LlamaDecoderLayer,
|
||||
):
|
||||
super().__init__(vllm_config=vllm_config,
|
||||
prefix=prefix,
|
||||
layer_type=layer_type)
|
||||
"""
|
||||
This implementation is based on the µScaling paper presented at
|
||||
the ICLR 2025 Workshop:
|
||||
NanoLM: An Affordable LLM Study Benchmark \
|
||||
via Accurate Loss Prediction across Scales
|
||||
by Yiqun Yao et al.
|
||||
Available at: https://openreview.net/forum?id=IwaPYg1SCA
|
||||
arXiv preprint: https://arxiv.org/abs/2304.06875
|
||||
"""
|
||||
self.use_mup = self.config.use_mup
|
||||
if self.use_mup:
|
||||
self.input_mult = self.config.input_mult
|
||||
|
||||
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
|
||||
embedding = self.embed_tokens(input_ids)
|
||||
if self.use_mup:
|
||||
embedding = embedding * self.input_mult
|
||||
return embedding
|
||||
|
||||
|
||||
class TeleFLMForCausalLM(LlamaForCausalLM):
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
super().__init__(vllm_config=vllm_config, prefix=prefix)
|
||||
# mup
|
||||
self.use_mup = self.config.use_mup
|
||||
if self.use_mup:
|
||||
self.mup_scale_factor = self.config.mup_scale_factor
|
||||
self.output_mult = self.config.output_mult / self.mup_scale_factor
|
||||
logit_scale = self.output_mult
|
||||
self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
|
||||
self.config.vocab_size,
|
||||
logit_scale)
|
||||
@ -223,7 +223,12 @@ class RequestOutput:
|
||||
if delta:
|
||||
# Slice logprobs delta if applicable
|
||||
if output_logprobs:
|
||||
output_logprobs = output_logprobs[-num_output_tokens:]
|
||||
# num_output_tokens can be 0 when n > 1 and request finishes
|
||||
# before the others
|
||||
if num_output_tokens > 0:
|
||||
output_logprobs = output_logprobs[-num_output_tokens:]
|
||||
else:
|
||||
output_logprobs = None
|
||||
# Don't include prompt if this is after the first output
|
||||
# containing decode token ids
|
||||
if include_prompt and seq.get_output_len() > num_output_tokens:
|
||||
|
||||
@ -2,7 +2,6 @@
|
||||
|
||||
import logging
|
||||
import traceback
|
||||
from contextlib import suppress
|
||||
from itertools import chain
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
@ -191,21 +190,6 @@ def neuron_platform_plugin() -> Optional[str]:
|
||||
return "vllm.platforms.neuron.NeuronPlatform" if is_neuron else None
|
||||
|
||||
|
||||
def openvino_platform_plugin() -> Optional[str]:
|
||||
is_openvino = False
|
||||
logger.debug("Checking if OpenVINO platform is available.")
|
||||
with suppress(Exception):
|
||||
is_openvino = vllm_version_matches_substr("openvino")
|
||||
if is_openvino:
|
||||
logger.debug("Confirmed OpenVINO platform is available"
|
||||
" because vLLM is built with OpenVINO.")
|
||||
if not is_openvino:
|
||||
logger.debug("OpenVINO platform is not available because"
|
||||
" vLLM is not built with OpenVINO.")
|
||||
|
||||
return "vllm.platforms.openvino.OpenVinoPlatform" if is_openvino else None
|
||||
|
||||
|
||||
builtin_platform_plugins = {
|
||||
'tpu': tpu_platform_plugin,
|
||||
'cuda': cuda_platform_plugin,
|
||||
@ -214,7 +198,6 @@ builtin_platform_plugins = {
|
||||
'xpu': xpu_platform_plugin,
|
||||
'cpu': cpu_platform_plugin,
|
||||
'neuron': neuron_platform_plugin,
|
||||
'openvino': openvino_platform_plugin,
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -213,9 +213,14 @@ class CudaPlatformBase(Platform):
|
||||
return ("vllm.attention.backends."
|
||||
"flashmla.FlashMLABackend")
|
||||
if use_v1:
|
||||
logger.info_once("Using Flash Attention backend on V1 engine.")
|
||||
return ("vllm.v1.attention.backends.flash_attn."
|
||||
"FlashAttentionBackend")
|
||||
if selected_backend == _Backend.TRITON_ATTN_VLLM_V1:
|
||||
logger.info_once("Using Triton backend on V1 engine.")
|
||||
return ("vllm.v1.attention.backends."
|
||||
"triton_attn.TritonAttentionBackend")
|
||||
if cls.has_device_capability(80):
|
||||
logger.info_once("Using Flash Attention backend on V1 engine.")
|
||||
return ("vllm.v1.attention.backends."
|
||||
"flash_attn.FlashAttentionBackend")
|
||||
if selected_backend == _Backend.FLASHINFER:
|
||||
logger.info("Using FlashInfer backend.")
|
||||
return "vllm.attention.backends.flashinfer.FlashInferBackend"
|
||||
|
||||
@ -29,10 +29,10 @@ def in_wsl() -> bool:
|
||||
class _Backend(enum.Enum):
|
||||
FLASH_ATTN = enum.auto()
|
||||
FLASH_ATTN_VLLM_V1 = enum.auto()
|
||||
TRITON_ATTN_VLLM_V1 = enum.auto()
|
||||
XFORMERS = enum.auto()
|
||||
ROCM_FLASH = enum.auto()
|
||||
TORCH_SDPA = enum.auto()
|
||||
OPENVINO = enum.auto()
|
||||
FLASHINFER = enum.auto()
|
||||
TRITON_MLA = enum.auto() # Supported by V1
|
||||
FLASHMLA = enum.auto() # Supported by V1
|
||||
@ -52,7 +52,6 @@ class PlatformEnum(enum.Enum):
|
||||
XPU = enum.auto()
|
||||
CPU = enum.auto()
|
||||
NEURON = enum.auto()
|
||||
OPENVINO = enum.auto()
|
||||
OOT = enum.auto()
|
||||
UNSPECIFIED = enum.auto()
|
||||
|
||||
@ -135,9 +134,6 @@ class Platform:
|
||||
def is_neuron(self) -> bool:
|
||||
return self._enum == PlatformEnum.NEURON
|
||||
|
||||
def is_openvino(self) -> bool:
|
||||
return self._enum == PlatformEnum.OPENVINO
|
||||
|
||||
def is_out_of_tree(self) -> bool:
|
||||
return self._enum == PlatformEnum.OOT
|
||||
|
||||
|
||||
@ -1,152 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from .interface import Platform, PlatformEnum, _Backend
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import VllmConfig
|
||||
else:
|
||||
VllmConfig = None
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
try:
|
||||
import openvino as ov
|
||||
import openvino.properties.hint as hints
|
||||
except ImportError as e:
|
||||
logger.warning("Failed to import OpenVINO with %r", e)
|
||||
|
||||
|
||||
class OpenVinoPlatform(Platform):
|
||||
_enum = PlatformEnum.OPENVINO
|
||||
device_name: str = "openvino"
|
||||
device_type: str = "openvino"
|
||||
dispatch_key: str = "CPU"
|
||||
|
||||
@classmethod
|
||||
def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
|
||||
dtype: torch.dtype, kv_cache_dtype: Optional[str],
|
||||
block_size: int, use_v1: bool,
|
||||
use_mla: bool) -> str:
|
||||
if selected_backend != _Backend.OPENVINO:
|
||||
logger.info("Cannot use %s backend on OpenVINO.", selected_backend)
|
||||
logger.info("Using OpenVINO Attention backend.")
|
||||
return "vllm.attention.backends.openvino.OpenVINOAttentionBackend"
|
||||
|
||||
@classmethod
|
||||
def get_device_name(cls, device_id: int = 0) -> str:
|
||||
return "openvino"
|
||||
|
||||
@classmethod
|
||||
def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def inference_mode(cls):
|
||||
return torch.inference_mode(mode=True)
|
||||
|
||||
@classmethod
|
||||
def is_openvino_cpu(cls) -> bool:
|
||||
return "CPU" in envs.VLLM_OPENVINO_DEVICE
|
||||
|
||||
@classmethod
|
||||
def is_openvino_gpu(cls) -> bool:
|
||||
return "GPU" in envs.VLLM_OPENVINO_DEVICE
|
||||
|
||||
@classmethod
|
||||
def is_pin_memory_available(cls) -> bool:
|
||||
logger.warning("Pin memory is not supported on OpenViNO.")
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
|
||||
from vllm.utils import GiB_bytes
|
||||
|
||||
parallel_config = vllm_config.parallel_config
|
||||
assert (parallel_config.world_size == 1
|
||||
), "OpenVINO only supports single CPU socket currently."
|
||||
|
||||
if parallel_config.worker_cls == "auto":
|
||||
parallel_config.worker_cls = \
|
||||
"vllm.worker.openvino_worker.OpenVINOWorker"
|
||||
|
||||
# check and update model config
|
||||
model_config = vllm_config.model_config
|
||||
if model_config.dtype != torch.float32:
|
||||
logger.warning(
|
||||
f"Only float32 dtype is supported on OpenVINO, casting from {model_config.dtype}." # noqa: G004, E501
|
||||
)
|
||||
model_config.dtype = torch.float32
|
||||
if not model_config.enforce_eager:
|
||||
logger.warning(
|
||||
"CUDA graph is not supported on OpenVINO backend, fallback to "
|
||||
"the eager mode.")
|
||||
model_config.enforce_eager = True
|
||||
|
||||
# check and update cache config
|
||||
ov_core = ov.Core()
|
||||
cache_config = vllm_config.cache_config
|
||||
if cache_config and cache_config.block_size is None:
|
||||
cache_config.block_size = 16
|
||||
|
||||
if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8":
|
||||
if not OpenVinoPlatform.is_openvino_cpu():
|
||||
logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is "
|
||||
"ignored for GPU, f16 data type will be used.")
|
||||
cache_config.cache_dtype = ov.Type.f16
|
||||
else:
|
||||
logger.info("KV cache type is overridden to u8 via "
|
||||
"VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var.")
|
||||
cache_config.cache_dtype = ov.Type.u8
|
||||
else:
|
||||
if OpenVinoPlatform.is_openvino_cpu():
|
||||
ov_device = envs.VLLM_OPENVINO_DEVICE
|
||||
inference_precision = ov_core.get_property(
|
||||
ov_device, hints.inference_precision)
|
||||
if inference_precision == ov.Type.bf16:
|
||||
cache_config.cache_dtype = ov.Type.bf16
|
||||
else:
|
||||
cache_config.cache_dtype = ov.Type.f16
|
||||
else:
|
||||
cache_config.cache_dtype = ov.Type.f16
|
||||
|
||||
if OpenVinoPlatform.is_openvino_cpu():
|
||||
if cache_config.block_size != 32:
|
||||
logger.info(
|
||||
f"OpenVINO CPU optimal block size is 32, overriding currently set {cache_config.block_size}" # noqa: G004, E501
|
||||
)
|
||||
cache_config.block_size = 32
|
||||
else:
|
||||
if cache_config.block_size != 16:
|
||||
logger.info(
|
||||
f"OpenVINO GPU optimal block size is 16, overriding currently set {cache_config.block_size}" # noqa: G004, E501
|
||||
)
|
||||
cache_config.block_size = 16
|
||||
|
||||
kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE
|
||||
if kv_cache_space >= 0:
|
||||
if kv_cache_space == 0 and OpenVinoPlatform.is_openvino_cpu():
|
||||
cache_config.openvino_kvcache_space_bytes = 4 * GiB_bytes # type: ignore
|
||||
logger.warning(
|
||||
"Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) "
|
||||
"for OpenVINO backend is not set, using 4 by default.")
|
||||
else:
|
||||
cache_config.openvino_kvcache_space_bytes = ( # type: ignore
|
||||
kv_cache_space * GiB_bytes)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
"Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE"
|
||||
f" {kv_cache_space}, expect a positive integer value.")
|
||||
|
||||
assert vllm_config.device_config.device_type == "openvino"
|
||||
assert vllm_config.lora_config is None, \
|
||||
"OpenVINO backend doesn't support LoRA"
|
||||
assert cls.is_openvino_cpu() or \
|
||||
cls.is_openvino_gpu(), \
|
||||
"OpenVINO backend supports only CPU and GPU devices"
|
||||
@ -120,8 +120,9 @@ class RocmPlatform(Platform):
|
||||
selected_backend = (_Backend.ROCM_FLASH if selected_backend
|
||||
== _Backend.FLASH_ATTN else selected_backend)
|
||||
if envs.VLLM_USE_V1:
|
||||
logger.info("Using ROCm Attention backend on V1 engine.")
|
||||
return "vllm.v1.attention.backends.rocm_attn.ROCmAttentionBackend"
|
||||
logger.info("Using Triton Attention backend on V1 engine.")
|
||||
return ("vllm.v1.attention.backends."
|
||||
"triton_attn.TritonAttentionBackend")
|
||||
if selected_backend == _Backend.ROCM_FLASH:
|
||||
if not cls.has_device_capability(90):
|
||||
# not Instinct series GPUs.
|
||||
|
||||
@ -33,7 +33,6 @@ class BaseTokenizerGroup(ABC):
|
||||
@abstractmethod
|
||||
def encode(self,
|
||||
prompt: str,
|
||||
request_id: Optional[str] = None,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
add_special_tokens: Optional[bool] = None) -> List[int]:
|
||||
"""Encode a prompt using the tokenizer group."""
|
||||
@ -43,7 +42,6 @@ class BaseTokenizerGroup(ABC):
|
||||
async def encode_async(
|
||||
self,
|
||||
prompt: str,
|
||||
request_id: Optional[str] = None,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
add_special_tokens: Optional[bool] = None) -> List[int]:
|
||||
"""Encode a prompt using the tokenizer group."""
|
||||
|
||||
@ -113,7 +113,6 @@ class RayTokenizerGroupPool(BaseTokenizerGroup):
|
||||
|
||||
def encode(self,
|
||||
prompt: str,
|
||||
request_id: Optional[str] = None,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
add_special_tokens: Optional[bool] = None) -> List[int]:
|
||||
"""Encode a prompt using the tokenizer group.
|
||||
@ -133,8 +132,7 @@ class RayTokenizerGroupPool(BaseTokenizerGroup):
|
||||
original_actor = actor
|
||||
try:
|
||||
ret = ray.get(
|
||||
actor.encode.remote(request_id=request_id,
|
||||
prompt=prompt,
|
||||
actor.encode.remote(prompt=prompt,
|
||||
lora_request=lora_request,
|
||||
add_special_tokens=add_special_tokens))
|
||||
except ActorDiedError as e:
|
||||
@ -145,8 +143,7 @@ class RayTokenizerGroupPool(BaseTokenizerGroup):
|
||||
actor = self._init_actor()
|
||||
try:
|
||||
ret = ray.get(
|
||||
actor.encode.remote(request_id=request_id,
|
||||
prompt=prompt,
|
||||
actor.encode.remote(prompt=prompt,
|
||||
lora_request=lora_request,
|
||||
add_special_tokens=add_special_tokens))
|
||||
except ActorDiedError as e:
|
||||
@ -164,7 +161,6 @@ class RayTokenizerGroupPool(BaseTokenizerGroup):
|
||||
async def encode_async(
|
||||
self,
|
||||
prompt: str,
|
||||
request_id: Optional[str] = None,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
add_special_tokens: Optional[bool] = None) -> List[int]:
|
||||
"""Encode a prompt using the tokenizer group.
|
||||
@ -184,7 +180,6 @@ class RayTokenizerGroupPool(BaseTokenizerGroup):
|
||||
original_actor = actor
|
||||
try:
|
||||
ret = await actor.encode.remote(
|
||||
request_id=request_id,
|
||||
prompt=prompt,
|
||||
lora_request=lora_request,
|
||||
add_special_tokens=add_special_tokens)
|
||||
@ -196,7 +191,6 @@ class RayTokenizerGroupPool(BaseTokenizerGroup):
|
||||
actor = self._init_actor()
|
||||
try:
|
||||
ret = await actor.encode.remote(
|
||||
request_id=request_id,
|
||||
prompt=prompt,
|
||||
lora_request=lora_request,
|
||||
add_special_tokens=add_special_tokens)
|
||||
|
||||
@ -56,7 +56,6 @@ class TokenizerGroup(BaseTokenizerGroup):
|
||||
|
||||
def encode(self,
|
||||
prompt: str,
|
||||
request_id: Optional[str] = None,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
add_special_tokens: Optional[bool] = None) -> List[int]:
|
||||
tokenizer = self.get_lora_tokenizer(lora_request)
|
||||
@ -69,7 +68,6 @@ class TokenizerGroup(BaseTokenizerGroup):
|
||||
async def encode_async(
|
||||
self,
|
||||
prompt: str,
|
||||
request_id: Optional[str] = None,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
add_special_tokens: Optional[bool] = None) -> List[int]:
|
||||
tokenizer = await self.get_lora_tokenizer_async(lora_request)
|
||||
|
||||
@ -153,6 +153,7 @@ STR_DTYPE_TO_TORCH_DTYPE = {
|
||||
"fp8": torch.uint8,
|
||||
"fp8_e4m3": torch.uint8,
|
||||
"fp8_e5m2": torch.uint8,
|
||||
"int8": torch.int8,
|
||||
}
|
||||
|
||||
TORCH_DTYPE_TO_NUMPY_DTYPE = {
|
||||
@ -2147,20 +2148,48 @@ def zmq_socket_ctx(path: str, socket_type: Any) -> Iterator[zmq.Socket]:
|
||||
ctx.destroy(linger=0)
|
||||
|
||||
|
||||
def _check_multiproc_method():
|
||||
if (cuda_is_initialized()
|
||||
and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"):
|
||||
logger.warning("CUDA was previously initialized. We must use "
|
||||
"the `spawn` multiprocessing start method. Setting "
|
||||
"VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
|
||||
"See https://docs.vllm.ai/en/latest/getting_started/"
|
||||
"troubleshooting.html#python-multiprocessing "
|
||||
"for more information.")
|
||||
def is_in_ray_actor():
|
||||
"""Check if we are in a Ray actor."""
|
||||
|
||||
try:
|
||||
import ray
|
||||
return (ray.is_initialized()
|
||||
and ray.get_runtime_context().get_actor_id() is not None)
|
||||
except ImportError:
|
||||
return False
|
||||
|
||||
|
||||
def _maybe_force_spawn():
|
||||
"""Check if we need to force the use of the `spawn` multiprocessing start
|
||||
method.
|
||||
"""
|
||||
if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") == "spawn":
|
||||
return
|
||||
|
||||
reason = None
|
||||
if cuda_is_initialized():
|
||||
reason = "CUDA is initialized"
|
||||
elif is_in_ray_actor():
|
||||
reason = "In a Ray actor and can only be spawned"
|
||||
|
||||
if reason is not None:
|
||||
logger.warning(
|
||||
"We must use the `spawn` multiprocessing start method. "
|
||||
"Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
|
||||
"See https://docs.vllm.ai/en/latest/getting_started/"
|
||||
"troubleshooting.html#python-multiprocessing "
|
||||
"for more information. Reason: %s", reason)
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
|
||||
def get_mp_context():
|
||||
_check_multiproc_method()
|
||||
"""Get a multiprocessing context with a particular method (spawn or fork).
|
||||
By default we follow the value of the VLLM_WORKER_MULTIPROC_METHOD to
|
||||
determine the multiprocessing method (default is fork). However, under
|
||||
certain conditions, we may enforce spawn and override the value of
|
||||
VLLM_WORKER_MULTIPROC_METHOD.
|
||||
"""
|
||||
_maybe_force_spawn()
|
||||
mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
|
||||
return multiprocessing.get_context(mp_method)
|
||||
|
||||
@ -2360,3 +2389,51 @@ def swap_dict_values(obj: dict[_K, _V], key1: _K, key2: _K) -> None:
|
||||
obj[key1] = v2
|
||||
else:
|
||||
obj.pop(key1, None)
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def cprofile_context(save_file: Optional[str] = None):
|
||||
"""Run a cprofile
|
||||
|
||||
Args:
|
||||
save_file: path to save the profile result. "1" or
|
||||
None will result in printing to stdout.
|
||||
"""
|
||||
import cProfile
|
||||
|
||||
prof = cProfile.Profile()
|
||||
prof.enable()
|
||||
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
prof.disable()
|
||||
if save_file and save_file != "1":
|
||||
prof.dump_stats(save_file)
|
||||
else:
|
||||
prof.print_stats(sort="cumtime")
|
||||
|
||||
|
||||
def cprofile(save_file: Optional[str] = None, enabled: bool = True):
|
||||
"""Decorator to profile a Python method using cProfile.
|
||||
|
||||
Args:
|
||||
save_file: Path to save the profile result.
|
||||
If "1", None, or "", results will be printed to stdout.
|
||||
enabled: Set to false to turn this into a no-op
|
||||
"""
|
||||
|
||||
def decorator(func: Callable):
|
||||
|
||||
@wraps(func)
|
||||
def wrapper(*args, **kwargs):
|
||||
if not enabled:
|
||||
# If profiling is disabled, just call the function directly.
|
||||
return func(*args, **kwargs)
|
||||
|
||||
with cprofile_context(save_file):
|
||||
return func(*args, **kwargs)
|
||||
|
||||
return wrapper
|
||||
|
||||
return decorator
|
||||
|
||||
@ -17,7 +17,7 @@ from vllm.platforms import current_platform
|
||||
from vllm.utils import cdiv
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.v1.core.scheduler_output import SchedulerOutput
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.worker.gpu_input_batch import InputBatch
|
||||
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
|
||||
|
||||
|
||||
@ -195,8 +195,8 @@ from vllm import _custom_ops as ops
|
||||
from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
|
||||
AttentionMetadata,
|
||||
MLAAttentionImpl)
|
||||
from vllm.attention.backends.utils import get_flash_attn_version
|
||||
from vllm.attention.ops.triton_merge_attn_states import merge_attn_states
|
||||
from vllm.fa_utils import get_flash_attn_version
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
LinearBase, RowParallelLinear,
|
||||
@ -212,7 +212,7 @@ except ImportError:
|
||||
from flash_attn import flash_attn_varlen_func
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.v1.core.scheduler_output import SchedulerOutput
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.worker.gpu_input_batch import InputBatch
|
||||
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
|
||||
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Attention layer with PagedAttention on rocm"""
|
||||
"""Attention layer with PagedAttention and Triton prefix prefill."""
|
||||
from typing import Any, Optional
|
||||
|
||||
import torch
|
||||
@ -16,7 +16,7 @@ from vllm.v1.attention.backends.flash_attn import (
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class ROCmAttentionBackend(AttentionBackend):
|
||||
class TritonAttentionBackend(AttentionBackend):
|
||||
|
||||
accept_output_buffer: bool = True
|
||||
|
||||
@ -26,11 +26,11 @@ class ROCmAttentionBackend(AttentionBackend):
|
||||
|
||||
@staticmethod
|
||||
def get_name() -> str:
|
||||
return "ROCM_ATTN_VLLM_V1"
|
||||
return "TRITON_ATTN_VLLM_V1"
|
||||
|
||||
@staticmethod
|
||||
def get_impl_cls() -> type["ROCmAttentionImpl"]:
|
||||
return ROCmAttentionImpl
|
||||
def get_impl_cls() -> type["TritonAttentionImpl"]:
|
||||
return TritonAttentionImpl
|
||||
|
||||
@staticmethod
|
||||
def get_metadata_cls() -> type["AttentionMetadata"]:
|
||||
@ -56,7 +56,7 @@ class ROCmAttentionBackend(AttentionBackend):
|
||||
return FlashAttentionMetadataBuilder
|
||||
|
||||
|
||||
class ROCmAttentionImpl(AttentionImpl):
|
||||
class TritonAttentionImpl(AttentionImpl):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@ -73,7 +73,7 @@ class ROCmAttentionImpl(AttentionImpl):
|
||||
) -> None:
|
||||
if blocksparse_params is not None:
|
||||
raise ValueError(
|
||||
"ROCmAttention does not support block-sparse attention.")
|
||||
"TritonAttention does not support block-sparse attention.")
|
||||
self.num_heads = num_heads
|
||||
self.head_size = head_size
|
||||
self.scale = float(scale)
|
||||
@ -90,17 +90,17 @@ class ROCmAttentionImpl(AttentionImpl):
|
||||
assert self.num_heads % self.num_kv_heads == 0
|
||||
self.num_queries_per_kv = self.num_heads // self.num_kv_heads
|
||||
|
||||
support_head_sizes = ROCmAttentionBackend.get_supported_head_sizes()
|
||||
support_head_sizes = TritonAttentionBackend.get_supported_head_sizes()
|
||||
if head_size not in support_head_sizes:
|
||||
raise ValueError(
|
||||
f"Head size {head_size} is not supported by ROCmAttention. "
|
||||
f"Head size {head_size} is not supported by TritonAttention. "
|
||||
f"Supported head sizes are: {support_head_sizes}.")
|
||||
|
||||
if attn_type != AttentionType.DECODER:
|
||||
raise NotImplementedError("Encoder self-attention and "
|
||||
"encoder/decoder cross-attention "
|
||||
"are not implemented for "
|
||||
"ROCmAttentionImpl")
|
||||
"TritonAttentionImpl")
|
||||
|
||||
def forward(
|
||||
self,
|
||||
@ -7,8 +7,8 @@ from typing import Any, NamedTuple, Optional
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.v1.kv_cache_interface import (KVCacheConfig, KVCacheSpec,
|
||||
KVCacheTensor)
|
||||
from vllm.v1.kv_cache_interface import (KVCacheConfig, KVCacheGroupSpec,
|
||||
KVCacheSpec, KVCacheTensor)
|
||||
from vllm.v1.metrics.stats import PrefixCacheStats
|
||||
from vllm.v1.request import Request
|
||||
|
||||
@ -449,7 +449,7 @@ def hash_request_tokens(block_size: int,
|
||||
|
||||
|
||||
def check_enough_kv_cache_memory(vllm_config: VllmConfig,
|
||||
kv_cache_spec: KVCacheSpec,
|
||||
kv_cache_spec: dict[str, KVCacheSpec],
|
||||
available_memory: int):
|
||||
"""
|
||||
Checks whether `available_memory` is enough for the KV cache to hold at
|
||||
@ -457,7 +457,7 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
|
||||
|
||||
Args:
|
||||
vllm_config: The global VllmConfig
|
||||
kv_cache_spec: The kv cache spec of the model
|
||||
kv_cache_spec: The kv cache spec of each attention layer in the model
|
||||
available_memory: Memory available for KV cache in bytes.
|
||||
|
||||
Raises:
|
||||
@ -484,12 +484,43 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
|
||||
f"`max_model_len` when initializing the engine.")
|
||||
|
||||
|
||||
def is_kv_cache_type_uniform(kv_cache_spec: KVCacheSpec) -> bool:
|
||||
def create_kv_cache_group_specs(
|
||||
kv_cache_spec: dict[str, KVCacheSpec],
|
||||
grouped_layer_names: list[list[str]]) -> list[KVCacheGroupSpec]:
|
||||
"""
|
||||
Create KVCacheGroupSpec object for each kv cache group layer.
|
||||
The layers in the same group should share the same
|
||||
KVCacheSpec.
|
||||
|
||||
Args:
|
||||
kv_cache_spec:
|
||||
A mapping from each layer name to its corresponding KVCacheSpec.
|
||||
grouped_layer_names:
|
||||
A list of kv cache groups, where each element is a list of layer
|
||||
names that belong to the same group and should share the same
|
||||
KVCacheSpec.
|
||||
Returns:
|
||||
A list of KVCacheGroupSpec objects, one for each group.
|
||||
"""
|
||||
kv_cache_groups = []
|
||||
for layer_names_one_group in grouped_layer_names:
|
||||
layer_spec = kv_cache_spec[layer_names_one_group[0]]
|
||||
assert all(
|
||||
kv_cache_spec[layer_name] == layer_spec
|
||||
for layer_name in layer_names_one_group[1:]), (
|
||||
"All layers in the same KV cache group must share the same "
|
||||
"KVCacheSpec.")
|
||||
kv_cache_groups.append(
|
||||
KVCacheGroupSpec(layer_names_one_group, layer_spec))
|
||||
return kv_cache_groups
|
||||
|
||||
|
||||
def is_kv_cache_type_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
|
||||
"""
|
||||
Whether all layers in the given KVCacheSpec have the same type of KV cache.
|
||||
|
||||
Args:
|
||||
kv_cache_spec: The KVCacheSpec of the model
|
||||
kv_cache_spec: The kv cache spec of each attention layer in the model
|
||||
|
||||
Returns:
|
||||
True if all layers have the same type, False otherwise.
|
||||
@ -500,18 +531,16 @@ def is_kv_cache_type_uniform(kv_cache_spec: KVCacheSpec) -> bool:
|
||||
|
||||
|
||||
def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
|
||||
kv_cache_spec: KVCacheSpec,
|
||||
available_memory: int,
|
||||
num_layers: int) -> KVCacheConfig:
|
||||
kv_cache_spec: dict[str, KVCacheSpec],
|
||||
available_memory: int) -> KVCacheConfig:
|
||||
"""
|
||||
Generates the KV cache configuration for a model with one type of KV cache.
|
||||
Divide the available memory equally among all layers.
|
||||
|
||||
Args:
|
||||
vllm_config: The global VllmConfig
|
||||
kv_cache_spec: The kv cache spec of the model
|
||||
kv_cache_spec: The kv cache spec of each attention layer in the model
|
||||
available_memory: Memory available for KV cache in bytes.
|
||||
num_layers: The number of layers in the model.
|
||||
|
||||
Returns:
|
||||
The generated KVCacheConfig
|
||||
@ -521,7 +550,7 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
|
||||
assert len(page_sizes) == 1
|
||||
page_size = page_sizes.pop()
|
||||
|
||||
num_blocks = int(available_memory // page_size // num_layers)
|
||||
num_blocks = int(available_memory // page_size // len(kv_cache_spec))
|
||||
num_blocks = max(num_blocks, 0)
|
||||
|
||||
if vllm_config.cache_config.num_gpu_blocks_override is not None:
|
||||
@ -541,6 +570,9 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
|
||||
max_model_len_str, max_concurrency)
|
||||
|
||||
per_layer_size = page_size * num_blocks
|
||||
# All layers have the same KV cache spec, so we create one kv cache group
|
||||
# for all layers.
|
||||
grouped_layer_names = [list(kv_cache_spec.keys())]
|
||||
|
||||
kv_cache_config = KVCacheConfig(
|
||||
num_blocks=num_blocks,
|
||||
@ -548,41 +580,69 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
|
||||
layer_name: KVCacheTensor(size=per_layer_size)
|
||||
for layer_name in kv_cache_spec
|
||||
},
|
||||
groups=[[layer_name for layer_name in kv_cache_spec]],
|
||||
kv_cache_spec=kv_cache_spec)
|
||||
kv_cache_groups=create_kv_cache_group_specs(kv_cache_spec,
|
||||
grouped_layer_names),
|
||||
)
|
||||
return kv_cache_config
|
||||
|
||||
|
||||
def get_kv_cache_configs(vllm_config: VllmConfig,
|
||||
kv_cache_specs: list[KVCacheSpec],
|
||||
available_memory: int) -> list[KVCacheConfig]:
|
||||
def get_kv_cache_config(vllm_config: VllmConfig,
|
||||
kv_cache_spec: dict[str, KVCacheSpec],
|
||||
available_memory: int) -> KVCacheConfig:
|
||||
"""
|
||||
Generates the KV cache configuration for a model
|
||||
TODO: support hybrid models with more than one type of KV cache.
|
||||
|
||||
Args:
|
||||
vllm_config: The global VllmConfig
|
||||
kv_cache_specs: The kv cache specs of the model
|
||||
kv_cache_spec: The kv cache spec of each attention layer in the model
|
||||
available_memory: Memory available for KV cache in bytes.
|
||||
|
||||
Returns:
|
||||
The generated KVCacheConfigs
|
||||
"""
|
||||
# Use the max number of layers to conservatively determine
|
||||
# the number of blocks.
|
||||
num_layers = max(len(kv_cache_spec) for kv_cache_spec in kv_cache_specs)
|
||||
kv_cache_configs = []
|
||||
for kv_cache_spec in kv_cache_specs:
|
||||
check_enough_kv_cache_memory(vllm_config, kv_cache_spec,
|
||||
available_memory)
|
||||
if is_kv_cache_type_uniform(kv_cache_spec):
|
||||
# KV cache of all layers are the same, which is true for
|
||||
# most models. Allocate the same amount of memory for
|
||||
# each layer.
|
||||
kv_cache_configs.append(
|
||||
_get_kv_cache_config_uniform_type(vllm_config, kv_cache_spec,
|
||||
available_memory,
|
||||
num_layers))
|
||||
else:
|
||||
raise NotImplementedError
|
||||
check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory)
|
||||
if is_kv_cache_type_uniform(kv_cache_spec):
|
||||
# KV cache of all layers are the same, which is true for
|
||||
# most models. Allocate the same amount of memory for
|
||||
# each layer.
|
||||
return _get_kv_cache_config_uniform_type(vllm_config, kv_cache_spec,
|
||||
available_memory)
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def unify_kv_cache_configs(kv_cache_configs: list[KVCacheConfig]):
|
||||
"""
|
||||
Make the KV cache configurations for each worker consistent, so that all
|
||||
workers can be controlled by the same KVCacheManager.
|
||||
This function verifies that the layer group of each worker are the same,
|
||||
and changes the num_blocks of each worker to the smallest among all workers.
|
||||
|
||||
Args:
|
||||
kv_cache_configs: The KV cache configurations for each worker. Will be
|
||||
in-place modified to make them consistent.
|
||||
"""
|
||||
|
||||
# Sort the kv cache groups by the type_id of their KV cache spec.
|
||||
# This can avoid the inconsistency caused by the order of groups.
|
||||
for kv_cache_config in kv_cache_configs:
|
||||
kv_cache_config.kv_cache_groups.sort(
|
||||
key=lambda x: x.kv_cache_spec.type_id)
|
||||
|
||||
# Verify that the groups of each rank are the same.
|
||||
for kv_cache_config in kv_cache_configs[1:]:
|
||||
for group_rank_0, group_rank_i in zip(
|
||||
kv_cache_configs[0].kv_cache_groups,
|
||||
kv_cache_config.kv_cache_groups):
|
||||
assert group_rank_0.kv_cache_spec == group_rank_i.kv_cache_spec
|
||||
|
||||
# Change the num_blocks of each rank to the smallest among all ranks. We
|
||||
# do not need to shrink the tensor size because it is valid to only use the
|
||||
# first `num_blocks` blocks of the tensor.
|
||||
min_num_blocks = min(kv_cache_config.num_blocks
|
||||
for kv_cache_config in kv_cache_configs)
|
||||
for kv_cache_config in kv_cache_configs:
|
||||
kv_cache_config.num_blocks = min_num_blocks
|
||||
|
||||
return kv_cache_configs
|
||||
|
||||
0
vllm/v1/core/sched/__init__.py
Normal file
0
vllm/v1/core/sched/__init__.py
Normal file
139
vllm/v1/core/sched/interface.py
Normal file
139
vllm/v1/core/sched/interface.py
Normal file
@ -0,0 +1,139 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from abc import ABC, abstractmethod
|
||||
from collections.abc import Iterable
|
||||
from typing import TYPE_CHECKING, Optional, Union
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.engine import EngineCoreOutputs
|
||||
from vllm.v1.metrics.stats import SchedulerStats
|
||||
from vllm.v1.outputs import ModelRunnerOutput
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
|
||||
|
||||
class SchedulerInterface(ABC):
|
||||
|
||||
@abstractmethod
|
||||
def schedule(self) -> "SchedulerOutput":
|
||||
"""Schedule the requests to process in this scheduling step.
|
||||
|
||||
The scheduling decision is made at the iteration level. Each scheduling
|
||||
step corresponds to a single forward pass of the model. Therefore, this
|
||||
method is called repeatedly by a busy loop in the engine.
|
||||
|
||||
Essentially, the scheduler produces a dictionary of {req_id: num_tokens}
|
||||
that specifies how many tokens to process for each request in this
|
||||
scheduling step. For example, num_tokens can be as large as the number
|
||||
of prompt tokens for new requests, or it can be 1 for the requests that
|
||||
are auto-regressively generating new tokens one by one. Otherwise, it
|
||||
can be somewhere in between in case of chunked prefills, prefix caching,
|
||||
speculative decoding, etc.
|
||||
|
||||
Additionally, the scheduler also returns useful data about each request
|
||||
or the batch as a whole. The model runner will use this information in
|
||||
preparing inputs to the model.
|
||||
|
||||
Returns:
|
||||
A SchedulerOutput object containing information about the scheduled
|
||||
requests.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def update_from_output(
|
||||
self,
|
||||
scheduler_output: "SchedulerOutput",
|
||||
model_runner_output: "ModelRunnerOutput",
|
||||
) -> "EngineCoreOutputs":
|
||||
"""Update the scheduler state based on the model runner output.
|
||||
|
||||
This method is called after the model runner has processed the scheduled
|
||||
requests. The model runner output includes generated token ids, draft
|
||||
token ids for next step, etc. The scheduler uses this information to
|
||||
update its states, checks the finished requests, and returns the output
|
||||
for each request.
|
||||
|
||||
Returns:
|
||||
A EngineCoreOutputs object containing the outputs for each request.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def add_request(self, request: "Request") -> None:
|
||||
"""Add a new request to the scheduler's internal queue.
|
||||
|
||||
Args:
|
||||
request: The new request being added.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def finish_requests(
|
||||
self,
|
||||
request_ids: Union[str, Iterable[str]],
|
||||
finished_status: "RequestStatus",
|
||||
) -> None:
|
||||
"""Finish the requests in the scheduler's internal queue. If the request
|
||||
is not in the queue, this method will do nothing.
|
||||
|
||||
This method is called in two cases:
|
||||
1. When the request is aborted by the client.
|
||||
2. When the frontend process detects a stop string of the request after
|
||||
de-tokenizing its generated tokens.
|
||||
|
||||
Args:
|
||||
request_ids: A single or a list of request IDs.
|
||||
finished_status: The finished status of the given requests.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def get_num_unfinished_requests(self) -> int:
|
||||
"""Number of unfinished requests in the scheduler's internal queue."""
|
||||
raise NotImplementedError
|
||||
|
||||
def has_unfinished_requests(self) -> bool:
|
||||
"""Returns True if there are unfinished requests in the scheduler's
|
||||
internal queue."""
|
||||
return self.get_num_unfinished_requests() > 0
|
||||
|
||||
@abstractmethod
|
||||
def has_finished_requests(self) -> bool:
|
||||
"""Returns True if there are finished requests that need to be cleared.
|
||||
NOTE: This is different from `not self.has_unfinished_requests()`.
|
||||
|
||||
The scheduler maintains an internal list of the requests finished in the
|
||||
previous step. This list is returned from the next call to schedule(),
|
||||
to be sent to the model runner in the next step to clear cached states
|
||||
for these finished requests.
|
||||
|
||||
This method checks if this internal list of finished requests is
|
||||
non-empty. This information is useful for DP attention.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def has_requests(self) -> bool:
|
||||
"""Returns True if there are unfinished requests, or finished requests
|
||||
not yet returned in SchedulerOutputs."""
|
||||
return self.has_unfinished_requests() or self.has_finished_requests()
|
||||
|
||||
@abstractmethod
|
||||
def get_num_unscheduled_requests(self) -> int:
|
||||
"""Number of requests that are not being processed by the executor."""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def reset_prefix_cache(self) -> bool:
|
||||
"""Reset the prefix cache for KV cache.
|
||||
|
||||
This is particularly required when the model weights are live-updated.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def make_stats(self) -> Optional["SchedulerStats"]:
|
||||
"""Make a SchedulerStats object for logging.
|
||||
|
||||
The SchedulerStats object is created for every scheduling step.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
@ -13,8 +13,10 @@ from vllm.logger import init_logger
|
||||
from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
|
||||
compute_encoder_budget)
|
||||
from vllm.v1.core.kv_cache_manager import KVCacheManager
|
||||
from vllm.v1.core.scheduler_output import (CachedRequestData, NewRequestData,
|
||||
SchedulerOutput)
|
||||
from vllm.v1.core.sched.interface import SchedulerInterface
|
||||
from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
|
||||
SchedulerOutput)
|
||||
from vllm.v1.core.sched.utils import check_stop
|
||||
from vllm.v1.engine import (EngineCoreEventType, EngineCoreOutput,
|
||||
EngineCoreOutputs)
|
||||
from vllm.v1.metrics.stats import SchedulerStats
|
||||
@ -25,7 +27,7 @@ from vllm.v1.structured_output import StructuredOutputManager
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class Scheduler:
|
||||
class Scheduler(SchedulerInterface):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@ -602,7 +604,7 @@ class Scheduler:
|
||||
|
||||
# Check for stop and update request state.
|
||||
# This must be called before we make the EngineCoreOutput.
|
||||
stopped = self._check_stop(request)
|
||||
stopped = check_stop(request, self.max_model_len)
|
||||
if stopped:
|
||||
self._free_request(request)
|
||||
break
|
||||
@ -648,25 +650,6 @@ class Scheduler:
|
||||
scheduler_stats=self.make_stats(),
|
||||
)
|
||||
|
||||
def _check_stop(self, request: Request) -> bool:
|
||||
if (request.num_tokens >= self.max_model_len
|
||||
or request.num_output_tokens >= request.max_tokens):
|
||||
request.status = RequestStatus.FINISHED_LENGTH_CAPPED
|
||||
return True
|
||||
|
||||
sampling_params = request.sampling_params
|
||||
last_token_id = request.output_token_ids[-1]
|
||||
if (not sampling_params.ignore_eos
|
||||
and last_token_id == request.eos_token_id):
|
||||
request.status = RequestStatus.FINISHED_STOPPED
|
||||
return True
|
||||
|
||||
if last_token_id in (sampling_params.stop_token_ids or ()):
|
||||
request.status = RequestStatus.FINISHED_STOPPED
|
||||
request.stop_reason = last_token_id
|
||||
return True
|
||||
return False
|
||||
|
||||
def add_request(self, request: Request) -> None:
|
||||
self.waiting.append(request)
|
||||
self.requests[request.request_id] = request
|
||||
@ -715,17 +698,9 @@ class Scheduler:
|
||||
def get_num_unfinished_requests(self) -> int:
|
||||
return len(self.waiting) + len(self.running)
|
||||
|
||||
def has_unfinished_requests(self) -> bool:
|
||||
return self.get_num_unfinished_requests() > 0
|
||||
|
||||
def has_finished_requests(self) -> bool:
|
||||
return len(self.finished_req_ids) > 0
|
||||
|
||||
def has_requests(self):
|
||||
"""Returns True if there are unfinished requests, or finished requests
|
||||
not yet returned in SchedulerOutputs."""
|
||||
return self.has_unfinished_requests() or self.has_finished_requests()
|
||||
|
||||
def get_num_unscheduled_requests(self) -> int:
|
||||
"""Number of requests that are not being processed by the executor."""
|
||||
return self.get_num_unfinished_requests() - len(self.scheduled_req_ids)
|
||||
22
vllm/v1/core/sched/utils.py
Normal file
22
vllm/v1/core/sched/utils.py
Normal file
@ -0,0 +1,22 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
|
||||
|
||||
def check_stop(request: Request, max_model_len: int) -> bool:
|
||||
if (request.num_tokens >= max_model_len
|
||||
or request.num_output_tokens >= request.max_tokens):
|
||||
request.status = RequestStatus.FINISHED_LENGTH_CAPPED
|
||||
return True
|
||||
|
||||
sampling_params = request.sampling_params
|
||||
last_token_id = request.output_token_ids[-1]
|
||||
if (not sampling_params.ignore_eos
|
||||
and last_token_id == request.eos_token_id):
|
||||
request.status = RequestStatus.FINISHED_STOPPED
|
||||
return True
|
||||
|
||||
if last_token_id in (sampling_params.stop_token_ids or ()):
|
||||
request.status = RequestStatus.FINISHED_STOPPED
|
||||
request.stop_reason = last_token_id
|
||||
return True
|
||||
return False
|
||||
@ -4,6 +4,7 @@ import asyncio
|
||||
import logging
|
||||
import os
|
||||
from collections.abc import AsyncGenerator, Mapping
|
||||
from copy import copy
|
||||
from typing import Optional, Union
|
||||
|
||||
import numpy as np
|
||||
@ -25,6 +26,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils import Device, cdiv, kill_process_tree
|
||||
from vllm.v1.engine import EngineCoreRequest
|
||||
from vllm.v1.engine.core_client import EngineCoreClient
|
||||
from vllm.v1.engine.output_processor import OutputProcessor
|
||||
from vllm.v1.engine.parallel_sampling import ParentRequest
|
||||
@ -177,34 +179,45 @@ class AsyncLLM(EngineClient):
|
||||
) -> asyncio.Queue[RequestOutput]:
|
||||
"""Add new request to the AsyncLLM."""
|
||||
|
||||
# 1) Create a new output queue for the request.
|
||||
# Create a new output queue for the request.
|
||||
queue: asyncio.Queue[RequestOutput] = asyncio.Queue()
|
||||
|
||||
# 2) Fan out child requests (for n>1)
|
||||
parent_req = ParentRequest.from_params(request_id, params)
|
||||
# Convert Input --> Request.
|
||||
request = self.processor.process_inputs(request_id, prompt, params,
|
||||
arrival_time, lora_request,
|
||||
trace_headers,
|
||||
prompt_adapter_request,
|
||||
priority)
|
||||
|
||||
n = params.n if isinstance(params, SamplingParams) else 1
|
||||
|
||||
if n == 1:
|
||||
await self._add_request(request, None, 0, queue)
|
||||
return queue
|
||||
|
||||
# Fan out child requests (for n>1).
|
||||
parent_request = ParentRequest(request_id, params)
|
||||
for idx in range(n):
|
||||
if parent_req is not None:
|
||||
request_id, params = parent_req.get_child_info(idx)
|
||||
|
||||
# 3) Convert Input --> Request.
|
||||
request = self.processor.process_inputs(request_id, prompt, params,
|
||||
arrival_time, lora_request,
|
||||
trace_headers,
|
||||
prompt_adapter_request,
|
||||
priority)
|
||||
|
||||
# 4) Add the request to OutputProcessor (this process).
|
||||
self.output_processor.add_request(request, parent_req, idx, queue)
|
||||
|
||||
# 5) Add the EngineCoreRequest to EngineCore (separate process).
|
||||
await self.engine_core.add_request_async(request)
|
||||
|
||||
if self.log_requests:
|
||||
logger.info("Added request %s.", request_id)
|
||||
|
||||
request_id, params = parent_request.get_child_info(idx)
|
||||
child_request = request if idx == n - 1 else copy(request)
|
||||
child_request.request_id = request_id
|
||||
child_request.sampling_params = params
|
||||
await self._add_request(child_request, parent_request, idx, queue)
|
||||
return queue
|
||||
|
||||
async def _add_request(self, request: EngineCoreRequest,
|
||||
parent_req: Optional[ParentRequest], index: int,
|
||||
queue: asyncio.Queue[RequestOutput]):
|
||||
|
||||
# Add the request to OutputProcessor (this process).
|
||||
self.output_processor.add_request(request, parent_req, index, queue)
|
||||
|
||||
# Add the EngineCoreRequest to EngineCore (separate process).
|
||||
await self.engine_core.add_request_async(request)
|
||||
|
||||
if self.log_requests:
|
||||
logger.info("Added request %s.", request.request_id)
|
||||
|
||||
# TODO: we should support multiple prompts in one call, as you
|
||||
# can do with LLM.generate. So that for multi-prompt completion
|
||||
# requests we don't need to send multiple messages to core proc,
|
||||
|
||||
@ -21,9 +21,10 @@ from vllm.transformers_utils.config import (
|
||||
maybe_register_config_serialize_by_value)
|
||||
from vllm.utils import (get_exception_traceback, resolve_obj_by_qualname,
|
||||
zmq_socket_ctx)
|
||||
from vllm.v1.core.kv_cache_utils import get_kv_cache_configs
|
||||
from vllm.v1.core.scheduler import Scheduler as V1Scheduler
|
||||
from vllm.v1.core.scheduler import SchedulerOutput
|
||||
from vllm.v1.core.kv_cache_utils import (get_kv_cache_config,
|
||||
unify_kv_cache_configs)
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler
|
||||
from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest,
|
||||
EngineCoreRequestType, UtilityOutput)
|
||||
from vllm.v1.engine.mm_input_cache import MMInputCacheServer
|
||||
@ -120,15 +121,27 @@ class EngineCore:
|
||||
# memory can be allocated for kv cache.
|
||||
available_gpu_memory = self.model_executor.determine_available_memory()
|
||||
|
||||
assert len(kv_cache_specs) == len(available_gpu_memory)
|
||||
# Get the kv cache tensor size
|
||||
kv_cache_configs = get_kv_cache_configs(vllm_config, kv_cache_specs,
|
||||
available_gpu_memory)
|
||||
num_gpu_blocks_set = set(config.num_blocks
|
||||
for config in kv_cache_configs)
|
||||
assert len(num_gpu_blocks_set) == 1, (
|
||||
f"num_gpu_blocks need to be the same across workers, "
|
||||
f"but they are different: {num_gpu_blocks_set}")
|
||||
num_gpu_blocks = num_gpu_blocks_set.pop()
|
||||
kv_cache_configs = [
|
||||
get_kv_cache_config(vllm_config, kv_cache_spec_one_worker,
|
||||
available_gpu_memory_one_worker)
|
||||
for kv_cache_spec_one_worker, available_gpu_memory_one_worker in
|
||||
zip(kv_cache_specs, available_gpu_memory)
|
||||
]
|
||||
|
||||
# Since we use a shared centralized controller, we need the
|
||||
# `kv_cache_config` to be consistent across all workers to make sure
|
||||
# all the memory operators can be applied to all workers.
|
||||
unify_kv_cache_configs(kv_cache_configs)
|
||||
|
||||
# All workers have the same kv_cache_config except layer names, so use
|
||||
# an arbitrary one to get the number of blocks.
|
||||
assert all([
|
||||
cfg.num_blocks == kv_cache_configs[0].num_blocks
|
||||
for cfg in kv_cache_configs
|
||||
])
|
||||
num_gpu_blocks = kv_cache_configs[0].num_blocks
|
||||
num_cpu_blocks = 0
|
||||
|
||||
# Initialize kv cache and warmup the execution
|
||||
|
||||
@ -212,9 +212,9 @@ class BackgroundResources:
|
||||
"""Used as a finalizer for clean shutdown, avoiding
|
||||
circular reference back to the client object."""
|
||||
|
||||
ctx: Union[zmq.Context] = None
|
||||
output_socket: Union[zmq.Socket, zmq.asyncio.Socket] = None
|
||||
input_socket: Union[zmq.Socket, zmq.asyncio.Socket] = None
|
||||
ctx: zmq.Context
|
||||
output_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None
|
||||
input_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None
|
||||
proc_handle: Optional[BackgroundProcHandle] = None
|
||||
shutdown_path: Optional[str] = None
|
||||
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from collections.abc import Mapping
|
||||
from copy import copy
|
||||
from typing import Optional, Union
|
||||
|
||||
from typing_extensions import TypeVar
|
||||
@ -179,25 +180,34 @@ class LLMEngine:
|
||||
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
|
||||
priority: int = 0,
|
||||
) -> None:
|
||||
# 1) Fan out child requests (for n>1)
|
||||
parent_req = ParentRequest.from_params(request_id, params)
|
||||
# Process raw inputs into the request.
|
||||
request = self.processor.process_inputs(request_id, prompt, params,
|
||||
arrival_time, lora_request,
|
||||
trace_headers,
|
||||
prompt_adapter_request,
|
||||
priority)
|
||||
|
||||
n = params.n if isinstance(params, SamplingParams) else 1
|
||||
for idx in range(n):
|
||||
if parent_req is not None:
|
||||
request_id, params = parent_req.get_child_info(idx)
|
||||
|
||||
# 2) Process raw inputs into the request.
|
||||
request = self.processor.process_inputs(request_id, prompt, params,
|
||||
arrival_time, lora_request,
|
||||
trace_headers,
|
||||
prompt_adapter_request,
|
||||
priority)
|
||||
|
||||
# 3) Make a new RequestState and queue.
|
||||
self.output_processor.add_request(request, parent_req, idx)
|
||||
|
||||
# 3) Add the request to EngineCore.
|
||||
if n == 1:
|
||||
# Make a new RequestState and queue.
|
||||
self.output_processor.add_request(request, None, 0)
|
||||
# Add the request to EngineCore.
|
||||
self.engine_core.add_request(request)
|
||||
return
|
||||
|
||||
# Fan out child requests (for n>1).
|
||||
parent_req = ParentRequest(request_id, params)
|
||||
for idx in range(n):
|
||||
request_id, params = parent_req.get_child_info(idx)
|
||||
child_request = request if idx == n - 1 else copy(request)
|
||||
child_request.request_id = request_id
|
||||
child_request.sampling_params = params
|
||||
|
||||
# Make a new RequestState and queue.
|
||||
self.output_processor.add_request(child_request, parent_req, idx)
|
||||
# Add the request to EngineCore.
|
||||
self.engine_core.add_request(child_request)
|
||||
|
||||
def step(self) -> list[RequestOutput]:
|
||||
|
||||
|
||||
@ -1,10 +1,9 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from copy import copy
|
||||
from typing import Optional, Union
|
||||
from typing import Optional
|
||||
|
||||
from vllm.outputs import CompletionOutput
|
||||
from vllm.pooling_params import PoolingParams
|
||||
from vllm.sampling_params import RequestOutputKind, SamplingParams
|
||||
from vllm.v1.metrics.stats import IterationStats
|
||||
|
||||
@ -43,16 +42,6 @@ class ParentRequest:
|
||||
self.max_num_generation_tokens = 0
|
||||
self.cached_child_sampling_params = None
|
||||
|
||||
@classmethod
|
||||
def from_params(
|
||||
cls,
|
||||
request_id: str,
|
||||
params: Union[SamplingParams, PoolingParams],
|
||||
) -> Optional['ParentRequest']:
|
||||
if not isinstance(params, SamplingParams) or params.n == 1:
|
||||
return None
|
||||
return cls(request_id, params)
|
||||
|
||||
def _get_child_sampling_params(
|
||||
self,
|
||||
index: int,
|
||||
|
||||
@ -120,7 +120,7 @@ class Processor:
|
||||
if not params.guided_decoding or not self.decoding_config:
|
||||
return
|
||||
|
||||
supported_backends = ["xgrammar"]
|
||||
supported_backends = ["xgrammar", "xgrammar:disable-any-whitespace"]
|
||||
engine_level_backend = self.decoding_config.guided_decoding_backend
|
||||
if engine_level_backend not in supported_backends:
|
||||
raise ValueError(f"Only {supported_backends} structured output is "
|
||||
@ -173,7 +173,6 @@ class Processor:
|
||||
# 3. Apply prompt adapter to prompt token ids if one exists.
|
||||
processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
|
||||
prompt,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
prompt_adapter_request=prompt_adapter_request,
|
||||
return_mm_hashes=self.use_hash,
|
||||
|
||||
@ -62,14 +62,11 @@ class Executor(ExecutorBase):
|
||||
args=(kv_cache_configs, ))
|
||||
self.collective_rpc("compile_or_warm_up_model")
|
||||
|
||||
def determine_available_memory(self) -> int: # in bytes
|
||||
def determine_available_memory(self) -> list[int]: # in bytes
|
||||
output = self.collective_rpc("determine_available_memory")
|
||||
# Since we use a shared centralized controller, we take the minimum
|
||||
# memory size across all workers to make sure all the memory
|
||||
# operators can be applied to all workers.
|
||||
return min(output)
|
||||
return output
|
||||
|
||||
def get_kv_cache_specs(self) -> list[KVCacheSpec]:
|
||||
def get_kv_cache_specs(self) -> list[dict[str, KVCacheSpec]]:
|
||||
output = self.collective_rpc("get_kv_cache_spec")
|
||||
return output
|
||||
|
||||
@ -95,7 +92,7 @@ class UniProcExecutor(UniProcExecutorV0, Executor):
|
||||
|
||||
class ExecutorWithExternalLauncher(ExecutorWithExternalLauncherV0, Executor):
|
||||
|
||||
def determine_available_memory(self) -> int: # in bytes
|
||||
def determine_available_memory(self) -> list[int]: # in bytes
|
||||
# same as determine_num_available_blocks in v0,
|
||||
# we need to get the min across all ranks.
|
||||
memory = super().determine_available_memory()
|
||||
@ -103,4 +100,4 @@ class ExecutorWithExternalLauncher(ExecutorWithExternalLauncherV0, Executor):
|
||||
cpu_group = get_world_group().cpu_group
|
||||
memory_tensor = torch.tensor([memory], device="cpu", dtype=torch.int64)
|
||||
dist.all_reduce(memory_tensor, group=cpu_group, op=dist.ReduceOp.MIN)
|
||||
return memory_tensor.item()
|
||||
return [memory_tensor.item()]
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user