mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-22 10:14:37 +08:00
Remove openvino support in favor of external plugin (#15339)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
This commit is contained in:
parent
dd861b992f
commit
b877031d80
@ -1,16 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This script build the OpenVINO docker image and run the offline inference inside the container.
|
||||
# It serves a sanity check for compilation and basic model usage.
|
||||
set -ex
|
||||
|
||||
# Try building the docker image
|
||||
docker build -t openvino-test -f Dockerfile.openvino .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() { docker rm -f openvino-test || true; }
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Run the image and launch offline inference
|
||||
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
||||
@ -1,29 +0,0 @@
|
||||
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
|
||||
# to run the OpenAI compatible server.
|
||||
|
||||
FROM ubuntu:22.04 AS dev
|
||||
|
||||
RUN apt-get update -y && \
|
||||
apt-get install -y \
|
||||
git python3-pip \
|
||||
ffmpeg libsm6 libxext6 libgl1
|
||||
WORKDIR /workspace
|
||||
|
||||
COPY . .
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
||||
|
||||
RUN python3 -m pip install -U pip
|
||||
# install build requirements
|
||||
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements/build.txt
|
||||
# build vLLM with OpenVINO backend
|
||||
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace
|
||||
|
||||
COPY examples/ /workspace/examples
|
||||
COPY benchmarks/ /workspace/benchmarks
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@ -26,4 +26,3 @@ installation/ai_accelerator
|
||||
- Google TPU
|
||||
- Intel Gaudi
|
||||
- AWS Neuron
|
||||
- OpenVINO
|
||||
|
||||
@ -36,16 +36,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} ai_accelerator/openvino.inc.md
|
||||
:start-after: "# Installation"
|
||||
:end-before: "## Requirements"
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
## Requirements
|
||||
@ -83,16 +73,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} ai_accelerator/openvino.inc.md
|
||||
:start-after: "## Requirements"
|
||||
:end-before: "## Set up using Python"
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
## Configure a new environment
|
||||
@ -130,14 +110,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} python_env_setup.inc.md
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
## Set up using Python
|
||||
@ -177,16 +149,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} ai_accelerator/openvino.inc.md
|
||||
:start-after: "### Pre-built wheels"
|
||||
:end-before: "### Build wheel from source"
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
### Build wheel from source
|
||||
@ -224,16 +186,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} ai_accelerator/openvino.inc.md
|
||||
:start-after: "### Build wheel from source"
|
||||
:end-before: "## Set up using Docker"
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
## Set up using Docker
|
||||
@ -273,16 +225,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} ai_accelerator/openvino.inc.md
|
||||
:start-after: "### Pre-built images"
|
||||
:end-before: "### Build image from source"
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
### Build image from source
|
||||
@ -320,16 +262,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} ai_accelerator/openvino.inc.md
|
||||
:start-after: "### Build image from source"
|
||||
:end-before: "## Extra information"
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
## Extra information
|
||||
@ -364,13 +296,4 @@ vLLM is a Python library that supports the following AI accelerators. Select you
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} OpenVINO
|
||||
:sync: openvino
|
||||
|
||||
:::{include} ai_accelerator/openvino.inc.md
|
||||
:start-after: "## Extra information"
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
@ -1,110 +0,0 @@
|
||||
# Installation
|
||||
|
||||
vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)).
|
||||
|
||||
:::{attention}
|
||||
There are no pre-built wheels or images for this device, so you must build vLLM from source.
|
||||
:::
|
||||
|
||||
## Requirements
|
||||
|
||||
- OS: Linux
|
||||
- Instruction set architecture (ISA) requirement: at least AVX2.
|
||||
|
||||
## Set up using Python
|
||||
|
||||
### Pre-built wheels
|
||||
|
||||
Currently, there are no pre-built OpenVINO wheels.
|
||||
|
||||
### Build wheel from source
|
||||
|
||||
First, install Python and ensure you have the latest pip. For example, on Ubuntu 22.04, you can run:
|
||||
|
||||
```console
|
||||
sudo apt-get update -y
|
||||
sudo apt-get install python3
|
||||
pip install --upgrade pip
|
||||
```
|
||||
|
||||
Second, clone vLLM and install prerequisites for the vLLM OpenVINO backend installation:
|
||||
|
||||
```console
|
||||
git clone https://github.com/vllm-project/vllm.git
|
||||
cd vllm
|
||||
pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
```
|
||||
|
||||
Finally, install vLLM with OpenVINO backend:
|
||||
|
||||
```console
|
||||
PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
|
||||
```
|
||||
|
||||
:::{tip}
|
||||
To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html).
|
||||
:::
|
||||
|
||||
## Set up using Docker
|
||||
|
||||
### Pre-built images
|
||||
|
||||
Currently, there are no pre-built OpenVINO images.
|
||||
|
||||
### Build image from source
|
||||
|
||||
```console
|
||||
docker build -f Dockerfile.openvino -t vllm-openvino-env .
|
||||
docker run -it --rm vllm-openvino-env
|
||||
```
|
||||
|
||||
## Extra information
|
||||
|
||||
## Supported features
|
||||
|
||||
OpenVINO vLLM backend supports the following advanced vLLM features:
|
||||
|
||||
- Prefix caching (`--enable-prefix-caching`)
|
||||
- Chunked prefill (`--enable-chunked-prefill`)
|
||||
|
||||
## Performance tips
|
||||
|
||||
### vLLM OpenVINO backend environment variables
|
||||
|
||||
- `VLLM_OPENVINO_DEVICE` to specify which device utilize for the inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g, `VLLM_OPENVINO_DEVICE=GPU.1`). If the value is not specified, CPU device is used by default.
|
||||
- `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON` to enable U8 weights compression during model loading stage. By default, compression is turned off. You can also export model with different compression techniques using `optimum-cli` and pass exported folder as `<model_id>`
|
||||
|
||||
### CPU performance tips
|
||||
|
||||
CPU uses the following environment variables to control behavior:
|
||||
|
||||
- `VLLM_OPENVINO_KVCACHE_SPACE` to specify the KV Cache size (e.g, `VLLM_OPENVINO_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
|
||||
- `VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8` to control KV cache precision. By default, FP16 / BF16 is used depending on platform.
|
||||
|
||||
To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (`--enable-chunked-prefill`). Based on the experiments, the recommended batch size is `256` (`--max-num-batched-tokens`)
|
||||
|
||||
OpenVINO best known configuration for CPU is:
|
||||
|
||||
```console
|
||||
$ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
|
||||
python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256
|
||||
```
|
||||
|
||||
### GPU performance tips
|
||||
|
||||
GPU device implements the logic for automatic detection of available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking into account `gpu_memory_utilization` option). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using `VLLM_OPENVINO_KVCACHE_SPACE` environment variable (e.g, `VLLM_OPENVINO_KVCACHE_SPACE=8` means 8 GB space for KV cache).
|
||||
|
||||
Currently, the best performance using GPU can be achieved with the default vLLM execution parameters for models with quantized weights (8 and 4-bit integer data types are supported) and `preemption-mode=swap`.
|
||||
|
||||
OpenVINO best known configuration for GPU is:
|
||||
|
||||
```console
|
||||
$ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
|
||||
python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
```
|
||||
|
||||
## Limitations
|
||||
|
||||
- LoRA serving is not supported.
|
||||
- Only LLM models are currently supported. LLaVa and encoder-decoder models are not currently enabled in vLLM OpenVINO integration.
|
||||
- Tensor and pipeline parallelism are not currently enabled in vLLM integration.
|
||||
@ -1,8 +0,0 @@
|
||||
# Common dependencies
|
||||
-r common.txt
|
||||
|
||||
torch == 2.5.1 # should be aligned with "common" vLLM torch version
|
||||
openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention
|
||||
|
||||
optimum @ git+https://github.com/huggingface/optimum.git # latest optimum is used to support latest transformers version
|
||||
optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git # latest optimum-intel is used to support latest transformers version
|
||||
10
setup.py
10
setup.py
@ -449,10 +449,6 @@ def _is_cpu() -> bool:
|
||||
return VLLM_TARGET_DEVICE == "cpu"
|
||||
|
||||
|
||||
def _is_openvino() -> bool:
|
||||
return VLLM_TARGET_DEVICE == "openvino"
|
||||
|
||||
|
||||
def _is_xpu() -> bool:
|
||||
return VLLM_TARGET_DEVICE == "xpu"
|
||||
|
||||
@ -572,8 +568,6 @@ def get_vllm_version() -> str:
|
||||
if gaudi_sw_version != MAIN_CUDA_VERSION:
|
||||
gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3]
|
||||
version += f"{sep}gaudi{gaudi_sw_version}"
|
||||
elif _is_openvino():
|
||||
version += f"{sep}openvino"
|
||||
elif _is_tpu():
|
||||
version += f"{sep}tpu"
|
||||
elif _is_cpu():
|
||||
@ -623,8 +617,6 @@ def get_requirements() -> list[str]:
|
||||
requirements = _read_requirements("neuron.txt")
|
||||
elif _is_hpu():
|
||||
requirements = _read_requirements("hpu.txt")
|
||||
elif _is_openvino():
|
||||
requirements = _read_requirements("openvino.txt")
|
||||
elif _is_tpu():
|
||||
requirements = _read_requirements("tpu.txt")
|
||||
elif _is_cpu():
|
||||
@ -634,7 +626,7 @@ def get_requirements() -> list[str]:
|
||||
else:
|
||||
raise ValueError(
|
||||
"Unsupported platform, please use CUDA, ROCm, Neuron, HPU, "
|
||||
"OpenVINO, or CPU.")
|
||||
"or CPU.")
|
||||
return requirements
|
||||
|
||||
|
||||
|
||||
@ -273,8 +273,7 @@ class HfRunner:
|
||||
def get_default_device(self):
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
return ("cpu" if current_platform.is_cpu()
|
||||
or current_platform.is_openvino() else "cuda")
|
||||
return ("cpu" if current_platform.is_cpu() else "cuda")
|
||||
|
||||
def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
|
||||
if x is None or isinstance(x, (bool, )):
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from unittest.mock import Mock, patch
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -8,7 +8,6 @@ import torch
|
||||
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
|
||||
from vllm.platforms.cpu import CpuPlatform
|
||||
from vllm.platforms.cuda import CudaPlatform
|
||||
from vllm.platforms.openvino import OpenVinoPlatform
|
||||
from vllm.platforms.rocm import RocmPlatform
|
||||
from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL
|
||||
|
||||
@ -21,9 +20,9 @@ def clear_cache():
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
|
||||
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"])
|
||||
@pytest.mark.parametrize("use_v1", [True, False])
|
||||
@pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
|
||||
@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
|
||||
def test_env(
|
||||
name: str,
|
||||
use_v1: bool,
|
||||
@ -51,13 +50,6 @@ def test_env(
|
||||
16, False)
|
||||
EXPECTED = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
|
||||
assert backend.get_name() == EXPECTED
|
||||
elif device == "openvino":
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
OpenVinoPlatform()), patch.dict('sys.modules',
|
||||
{'openvino': Mock()}):
|
||||
backend = get_attn_backend(16, torch.float16, torch.float16,
|
||||
16, False)
|
||||
assert backend.get_name() == "OPENVINO"
|
||||
else:
|
||||
if name in ["XFORMERS", "FLASHINFER"]:
|
||||
with patch("vllm.attention.selector.current_platform",
|
||||
|
||||
@ -786,7 +786,7 @@ def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator:
|
||||
without enough resources, or called when filtering tests to run directly.
|
||||
"""
|
||||
try:
|
||||
if current_platform.is_cpu() or current_platform.is_openvino():
|
||||
if current_platform.is_cpu():
|
||||
memory_gb = 0
|
||||
else:
|
||||
memory_gb = current_platform.get_device_total_memory() / GB_bytes
|
||||
|
||||
@ -1,146 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List, Optional, Tuple, Type
|
||||
|
||||
import openvino as ov
|
||||
import torch
|
||||
|
||||
from vllm.attention.backends.abstract import (AttentionBackend,
|
||||
AttentionMetadata)
|
||||
from vllm.attention.backends.utils import CommonAttentionState
|
||||
from vllm.multimodal import MultiModalPlaceholderMap
|
||||
|
||||
|
||||
def copy_cache_block(src_tensor: ov.Tensor, dst_tensor: ov.Tensor,
|
||||
src_offset: int, dst_offset: int) -> None:
|
||||
|
||||
def create_roi_tensor(
|
||||
tensor: ov.Tensor,
|
||||
block_number: int,
|
||||
) -> ov.Tensor:
|
||||
roi_begin = ov.runtime.Coordinate([0, 0, 0, 0])
|
||||
roi_end = ov.runtime.Coordinate(tensor.get_shape())
|
||||
|
||||
roi_begin[0] = block_number
|
||||
roi_end[0] = block_number + 1
|
||||
|
||||
if isinstance(tensor, ov.Tensor):
|
||||
return ov.Tensor(tensor, roi_begin, roi_end)
|
||||
else:
|
||||
return ov.RemoteTensor(tensor, roi_begin, roi_end)
|
||||
|
||||
src_roi_tensor = \
|
||||
create_roi_tensor(src_tensor, src_offset)
|
||||
dst_roi_tensor = \
|
||||
create_roi_tensor(dst_tensor, dst_offset)
|
||||
src_roi_tensor.copy_to(dst_roi_tensor)
|
||||
|
||||
|
||||
class OpenVINOAttentionBackend(AttentionBackend):
|
||||
|
||||
@staticmethod
|
||||
def get_name() -> str:
|
||||
return "OPENVINO"
|
||||
|
||||
@staticmethod
|
||||
def get_impl_cls():
|
||||
# OpenVINO implements PagedAttention as part of the Optimum
|
||||
# exported model
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
def make_metadata(*args, **kwargs) -> "AttentionMetadata":
|
||||
raise NotImplementedError
|
||||
|
||||
@staticmethod
|
||||
def get_state_cls() -> Type["CommonAttentionState"]:
|
||||
return CommonAttentionState
|
||||
|
||||
@staticmethod
|
||||
def make_openvino_metadata(*args, **kwargs) -> "OpenVINOAttentionMetadata":
|
||||
return OpenVINOAttentionMetadata(*args, **kwargs)
|
||||
|
||||
@staticmethod
|
||||
def get_kv_cache_shape(
|
||||
num_blocks: int,
|
||||
block_size: int,
|
||||
num_kv_heads: int,
|
||||
head_size: int,
|
||||
) -> Tuple[int, ...]:
|
||||
return (2, num_blocks, num_kv_heads, block_size, head_size)
|
||||
|
||||
@staticmethod
|
||||
def swap_blocks(
|
||||
src_tensor: ov.Tensor,
|
||||
dst_tensor: ov.Tensor,
|
||||
src_to_dists: List[Tuple[int, int]],
|
||||
) -> None:
|
||||
for src, dst in src_to_dists:
|
||||
copy_cache_block(src_tensor, dst_tensor, src, dst)
|
||||
|
||||
@staticmethod
|
||||
def copy_blocks(
|
||||
kv_caches: List[Tuple[ov.Tensor, ov.Tensor]],
|
||||
src_to_dists: List[Tuple[int, int]],
|
||||
) -> None:
|
||||
for src, dst in src_to_dists:
|
||||
for key_cache, value_cache in kv_caches:
|
||||
copy_cache_block(key_cache, key_cache, src, dst)
|
||||
copy_cache_block(value_cache, value_cache, src, dst)
|
||||
|
||||
|
||||
@dataclass
|
||||
class OpenVINOAttentionMetadata:
|
||||
"""Metadata for OpenVINOAttentionBackend.
|
||||
|
||||
Basic terms used below:
|
||||
- batch_size_in_sequences - total number of sequences to execute
|
||||
- prompt_lens – per sequence size number of scheduled tokens
|
||||
- batch_size_in_tokens = sum(prompt_lens)
|
||||
- max_context_len = max(context_lens)
|
||||
- max_num_blocks = div_up(max_context_len / BLOCK_SIZE)
|
||||
- num_blocks – total number of blocks in block_indices
|
||||
"""
|
||||
|
||||
# Describes past KV cache size for each sequence within a batch
|
||||
# Shape: [batch_size_in_sequences]
|
||||
# Type: i32
|
||||
past_lens: torch.Tensor
|
||||
|
||||
# Describes start indices of input / speculative tokens from
|
||||
# current sequences within a batch sequence
|
||||
# Shape: [batch_size_in_sequences + 1]
|
||||
# Type: i32
|
||||
subsequence_begins: torch.Tensor
|
||||
|
||||
# Describes block tables for each sequence within a batch -
|
||||
# indices along 0th dimension in key_cache and value_cache inputs
|
||||
# Shape: [num_blocks]
|
||||
# Type: i32
|
||||
block_indices: torch.Tensor
|
||||
|
||||
# Describes block tables for each sequence within a batch -
|
||||
# for i-th element, it is an index in block_indices with the
|
||||
# first block belonging to i-th sequence
|
||||
# Shape: [batch_size_in_sequences + 1]
|
||||
# Type: i32
|
||||
block_indices_begins: torch.Tensor
|
||||
|
||||
# Describes max context length
|
||||
# Shape: scalar
|
||||
# Type: i32
|
||||
max_context_len: torch.Tensor
|
||||
|
||||
# The index maps that relate multi-modal embeddings to the corresponding
|
||||
# placeholders.
|
||||
#
|
||||
# N.B. These aren't really related to attention and don't belong on this
|
||||
# type -- this is just a temporary solution to make them available to
|
||||
# `model_executable`.
|
||||
multi_modal_placeholder_index_maps: Optional[Dict[
|
||||
str, MultiModalPlaceholderMap.IndexMap]]
|
||||
|
||||
# Enable/disable KV scales calculation. This is so that we can disable the
|
||||
# calculation until after prefill and cuda graph capture.
|
||||
enable_kv_scales_calculation: bool
|
||||
@ -1801,7 +1801,7 @@ class DeviceConfig:
|
||||
self.device_type = device
|
||||
|
||||
# Some device types require processing inputs on CPU
|
||||
if self.device_type in ["neuron", "openvino"]:
|
||||
if self.device_type in ["neuron"]:
|
||||
self.device = torch.device("cpu")
|
||||
elif self.device_type in ["tpu"]:
|
||||
self.device = None
|
||||
|
||||
@ -40,7 +40,6 @@ DEVICE_OPTIONS = [
|
||||
"cuda",
|
||||
"neuron",
|
||||
"cpu",
|
||||
"openvino",
|
||||
"tpu",
|
||||
"xpu",
|
||||
"hpu",
|
||||
|
||||
28
vllm/envs.py
28
vllm/envs.py
@ -40,10 +40,6 @@ if TYPE_CHECKING:
|
||||
VLLM_CPU_KVCACHE_SPACE: int = 0
|
||||
VLLM_CPU_OMP_THREADS_BIND: str = ""
|
||||
VLLM_CPU_MOE_PREPACK: bool = True
|
||||
VLLM_OPENVINO_DEVICE: str = "CPU"
|
||||
VLLM_OPENVINO_KVCACHE_SPACE: int = 0
|
||||
VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None
|
||||
VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False
|
||||
VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
|
||||
VLLM_XLA_CHECK_RECOMPILATION: bool = False
|
||||
VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024
|
||||
@ -131,7 +127,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
# ================== Installation Time Env Vars ==================
|
||||
|
||||
# Target device of vLLM, supporting [cuda (by default),
|
||||
# rocm, neuron, cpu, openvino]
|
||||
# rocm, neuron, cpu]
|
||||
"VLLM_TARGET_DEVICE":
|
||||
lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),
|
||||
|
||||
@ -358,28 +354,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_CPU_MOE_PREPACK":
|
||||
lambda: bool(int(os.getenv("VLLM_CPU_MOE_PREPACK", "1"))),
|
||||
|
||||
# OpenVINO device selection
|
||||
# default is CPU
|
||||
"VLLM_OPENVINO_DEVICE":
|
||||
lambda: os.getenv("VLLM_OPENVINO_DEVICE", "CPU").upper(),
|
||||
|
||||
# OpenVINO key-value cache space
|
||||
# default is 4GB
|
||||
"VLLM_OPENVINO_KVCACHE_SPACE":
|
||||
lambda: int(os.getenv("VLLM_OPENVINO_KVCACHE_SPACE", "0")),
|
||||
|
||||
# OpenVINO KV cache precision
|
||||
# default is bf16 if natively supported by platform, otherwise f16
|
||||
# To enable KV cache compression, please, explicitly specify u8
|
||||
"VLLM_OPENVINO_CPU_KV_CACHE_PRECISION":
|
||||
lambda: os.getenv("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION", None),
|
||||
|
||||
# Enables weights compression during model export via HF Optimum
|
||||
# default is False
|
||||
"VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS":
|
||||
lambda:
|
||||
(os.environ.get("VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", "0").lower() in
|
||||
("on", "true", "1")),
|
||||
# If the env var is set, then all workers will execute as separate
|
||||
# processes from the engine, and we use the same mechanism to trigger
|
||||
# execution on all workers.
|
||||
|
||||
@ -1,204 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
# ruff: noqa: SIM117
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import openvino as ov
|
||||
import torch
|
||||
from huggingface_hub import HfApi
|
||||
from openvino._offline_transformations import paged_attention_transformation
|
||||
from optimum.intel import OVModelForCausalLM
|
||||
from torch import nn
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.forward_context import get_forward_context
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.logits_processor import (LogitsProcessor,
|
||||
_prune_hidden_states)
|
||||
from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def _flatten_inputs(inputs):
|
||||
"""
|
||||
Helper function for making nested inputs flattens
|
||||
"""
|
||||
flatten_inputs = []
|
||||
for input_data in inputs:
|
||||
if input_data is None:
|
||||
continue
|
||||
if isinstance(input_data, (list, tuple)):
|
||||
flatten_inputs.extend(_flatten_inputs(input_data))
|
||||
elif isinstance(input_data, dict):
|
||||
flatten_inputs.extend(_flatten_inputs(list(input_data.values())))
|
||||
else:
|
||||
flatten_inputs.append(input_data)
|
||||
return flatten_inputs
|
||||
|
||||
|
||||
def _modify_cache_parameters(model: ov.Model, kv_cache_dtype: ov.Type,
|
||||
is_cpu: bool):
|
||||
# Apply hardware dependent modifications to KV tensors
|
||||
for parameter in model.get_parameters():
|
||||
input = parameter.get_output_tensor(0)
|
||||
input_names = input.get_names()
|
||||
if len(input_names) != 1:
|
||||
continue
|
||||
input_name = next(iter(input_names))
|
||||
shape = parameter.get_partial_shape()
|
||||
# use real block size if available, just a placeholder
|
||||
# to provide the expected rank
|
||||
num_blocks = ov.Dimension()
|
||||
block_size = ov.Dimension()
|
||||
head_size = ov.Dimension()
|
||||
if input_name.startswith("key_cache."):
|
||||
cpu_shape = [num_blocks, shape[1], block_size, head_size]
|
||||
gpu_shape = [num_blocks, shape[1], shape[2], block_size]
|
||||
elif input_name.startswith("value_cache."):
|
||||
cpu_shape = [num_blocks, shape[1], block_size, head_size]
|
||||
gpu_shape = [num_blocks, shape[1], block_size, shape[2]]
|
||||
else:
|
||||
continue
|
||||
parameter.set_partial_shape(
|
||||
ov.PartialShape(cpu_shape if is_cpu else gpu_shape))
|
||||
parameter.set_element_type(kv_cache_dtype)
|
||||
model.validate_nodes_and_infer_types()
|
||||
|
||||
|
||||
def _require_model_export(model_id, revision=None, subfolder=None):
|
||||
model_dir = Path(model_id)
|
||||
if subfolder is not None:
|
||||
model_dir = model_dir / subfolder
|
||||
if model_dir.is_dir():
|
||||
return (not (model_dir / "openvino_model.xml").exists()
|
||||
or not (model_dir / "openvino_model.bin").exists())
|
||||
|
||||
hf_api = HfApi()
|
||||
try:
|
||||
model_info = hf_api.model_info(model_id, revision=revision or "main")
|
||||
normalized_subfolder = (None if subfolder is None else
|
||||
Path(subfolder).as_posix())
|
||||
model_files = [
|
||||
file.rfilename for file in model_info.siblings
|
||||
if normalized_subfolder is None
|
||||
or file.rfilename.startswith(normalized_subfolder)
|
||||
]
|
||||
ov_model_path = ("openvino_model.xml" if normalized_subfolder is None
|
||||
else f"{normalized_subfolder}/openvino_model.xml")
|
||||
return (ov_model_path not in model_files
|
||||
or ov_model_path.replace(".xml", ".bin") not in model_files)
|
||||
except Exception:
|
||||
return True
|
||||
|
||||
|
||||
class OpenVINOCausalLM(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
ov_core: ov.Core,
|
||||
model_config: ModelConfig,
|
||||
kv_cache_dtype: ov.Type,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.logits_processor = LogitsProcessor(
|
||||
model_config.hf_config.vocab_size, logits_as_input=True)
|
||||
self.sampler = Sampler()
|
||||
|
||||
export = _require_model_export(model_config.model)
|
||||
if export:
|
||||
logger.warning(
|
||||
f"Provided model id {model_config.model} does not " # noqa: G004
|
||||
"contain OpenVINO IR, the model will be converted to IR with "
|
||||
"default options. If you need to use specific options for "
|
||||
"model conversion, use optimum-cli export openvino with "
|
||||
"desired options.")
|
||||
else:
|
||||
logger.warning(
|
||||
"OpenVINO IR is available for provided model id " # noqa: G004
|
||||
f"{model_config.model}. This IR will be used for inference "
|
||||
"as-is, all possible options that may affect model conversion "
|
||||
"are ignored.")
|
||||
|
||||
load_in_8bit = (envs.VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS
|
||||
if export else False)
|
||||
pt_model = OVModelForCausalLM.from_pretrained(
|
||||
model_config.model,
|
||||
export=export,
|
||||
compile=False,
|
||||
load_in_8bit=load_in_8bit,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
)
|
||||
|
||||
ov_device = envs.VLLM_OPENVINO_DEVICE
|
||||
paged_attention_transformation(pt_model.model)
|
||||
_modify_cache_parameters(pt_model.model, kv_cache_dtype,
|
||||
current_platform.is_openvino_cpu())
|
||||
|
||||
ov_compiled = ov_core.compile_model(pt_model.model, ov_device)
|
||||
self.ov_request = ov_compiled.create_infer_request()
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
positions: torch.Tensor,
|
||||
kv_caches: list[tuple[ov.Tensor, ov.Tensor]],
|
||||
) -> torch.Tensor:
|
||||
flat_kv_caches = _flatten_inputs(kv_caches)
|
||||
attn_metadata = get_forward_context().attn_metadata
|
||||
|
||||
inputs = [
|
||||
input_ids,
|
||||
positions,
|
||||
*flat_kv_caches,
|
||||
attn_metadata.past_lens,
|
||||
attn_metadata.subsequence_begins,
|
||||
attn_metadata.block_indices,
|
||||
attn_metadata.block_indices_begins,
|
||||
attn_metadata.max_context_len,
|
||||
]
|
||||
|
||||
self.ov_request.start_async(inputs, share_inputs=True)
|
||||
self.ov_request.wait()
|
||||
|
||||
logits = torch.from_numpy(self.ov_request.get_tensor("logits").data)
|
||||
|
||||
# TODO: remove 'view' once OpenVINO PA will drop 'seq_len' dimension
|
||||
return logits.view(-1, logits.shape[-1])
|
||||
|
||||
def compute_logits(self, hidden_states: torch.Tensor,
|
||||
sampling_metadata: SamplingMetadata) -> torch.Tensor:
|
||||
hidden_states = _prune_hidden_states(hidden_states, sampling_metadata)
|
||||
logits = self.logits_processor(None, hidden_states, sampling_metadata)
|
||||
return logits
|
||||
|
||||
def sample(
|
||||
self,
|
||||
logits: torch.Tensor,
|
||||
sampling_metadata: SamplingMetadata,
|
||||
) -> Optional[SamplerOutput]:
|
||||
next_tokens = self.sampler(logits, sampling_metadata)
|
||||
return next_tokens
|
||||
|
||||
|
||||
def get_model(
|
||||
vllm_config: VllmConfig,
|
||||
kv_cache_dtype: ov.Type,
|
||||
**kwargs,
|
||||
) -> torch.nn.Module:
|
||||
lora_config = kwargs.get("lora_config")
|
||||
ov_core = kwargs.get("ov_core")
|
||||
if lora_config:
|
||||
raise ValueError(
|
||||
"OpenVINO modeling does not support LoRA, "
|
||||
"but LoRA is enabled. Support for this model may "
|
||||
"be added in the future. If this is important to you, "
|
||||
"please open an issue on github.")
|
||||
|
||||
with set_current_vllm_config(vllm_config):
|
||||
return OpenVINOCausalLM(ov_core, vllm_config.model_config,
|
||||
kv_cache_dtype)
|
||||
@ -2,7 +2,6 @@
|
||||
|
||||
import logging
|
||||
import traceback
|
||||
from contextlib import suppress
|
||||
from itertools import chain
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
@ -191,21 +190,6 @@ def neuron_platform_plugin() -> Optional[str]:
|
||||
return "vllm.platforms.neuron.NeuronPlatform" if is_neuron else None
|
||||
|
||||
|
||||
def openvino_platform_plugin() -> Optional[str]:
    """Return the OpenVINO platform plugin path if vLLM was built with it.

    Detection is based on the vLLM version string containing "openvino";
    any detection error is treated as "not available".
    """
    logger.debug("Checking if OpenVINO platform is available.")
    detected = False
    with suppress(Exception):
        detected = vllm_version_matches_substr("openvino")
        if detected:
            logger.debug("Confirmed OpenVINO platform is available"
                         " because vLLM is built with OpenVINO.")
    if not detected:
        logger.debug("OpenVINO platform is not available because"
                     " vLLM is not built with OpenVINO.")

    if detected:
        return "vllm.platforms.openvino.OpenVinoPlatform"
    return None
|
||||
|
||||
|
||||
builtin_platform_plugins = {
|
||||
'tpu': tpu_platform_plugin,
|
||||
'cuda': cuda_platform_plugin,
|
||||
@ -214,7 +198,6 @@ builtin_platform_plugins = {
|
||||
'xpu': xpu_platform_plugin,
|
||||
'cpu': cpu_platform_plugin,
|
||||
'neuron': neuron_platform_plugin,
|
||||
'openvino': openvino_platform_plugin,
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -33,7 +33,6 @@ class _Backend(enum.Enum):
|
||||
XFORMERS = enum.auto()
|
||||
ROCM_FLASH = enum.auto()
|
||||
TORCH_SDPA = enum.auto()
|
||||
OPENVINO = enum.auto()
|
||||
FLASHINFER = enum.auto()
|
||||
TRITON_MLA = enum.auto() # Supported by V1
|
||||
FLASHMLA = enum.auto() # Supported by V1
|
||||
@ -53,7 +52,6 @@ class PlatformEnum(enum.Enum):
|
||||
XPU = enum.auto()
|
||||
CPU = enum.auto()
|
||||
NEURON = enum.auto()
|
||||
OPENVINO = enum.auto()
|
||||
OOT = enum.auto()
|
||||
UNSPECIFIED = enum.auto()
|
||||
|
||||
@ -136,9 +134,6 @@ class Platform:
|
||||
def is_neuron(self) -> bool:
|
||||
return self._enum == PlatformEnum.NEURON
|
||||
|
||||
    def is_openvino(self) -> bool:
        # True when this Platform instance targets the OpenVINO backend.
        return self._enum == PlatformEnum.OPENVINO
|
||||
|
||||
def is_out_of_tree(self) -> bool:
|
||||
return self._enum == PlatformEnum.OOT
|
||||
|
||||
|
||||
@ -1,152 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from typing import TYPE_CHECKING, Optional
|
||||
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from .interface import Platform, PlatformEnum, _Backend
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import VllmConfig
|
||||
else:
|
||||
VllmConfig = None
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
try:
|
||||
import openvino as ov
|
||||
import openvino.properties.hint as hints
|
||||
except ImportError as e:
|
||||
logger.warning("Failed to import OpenVINO with %r", e)
|
||||
|
||||
|
||||
class OpenVinoPlatform(Platform):
    """Platform implementation for the OpenVINO backend.

    Registers the OpenVINO attention backend and worker class, and
    normalizes model/cache configuration (dtype, block size, KV-cache
    space) according to the OpenVINO target device (CPU or GPU, selected
    via ``VLLM_OPENVINO_DEVICE``).
    """
    _enum = PlatformEnum.OPENVINO
    device_name: str = "openvino"
    device_type: str = "openvino"
    dispatch_key: str = "CPU"

    @classmethod
    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
                             block_size: int, use_v1: bool,
                             use_mla: bool) -> str:
        """Return the attention backend class path (always OpenVINO's)."""
        if selected_backend != _Backend.OPENVINO:
            logger.info("Cannot use %s backend on OpenVINO.", selected_backend)
        logger.info("Using OpenVINO Attention backend.")
        return "vllm.attention.backends.openvino.OpenVINOAttentionBackend"

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """Return the fixed device name; `device_id` is ignored."""
        return "openvino"

    @classmethod
    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
        # Async output processing is not implemented for OpenVINO.
        return False

    @classmethod
    def inference_mode(cls):
        """Return a torch inference-mode context manager."""
        return torch.inference_mode(mode=True)

    @classmethod
    def is_openvino_cpu(cls) -> bool:
        # The env var may name multiple devices; substring match is intended.
        return "CPU" in envs.VLLM_OPENVINO_DEVICE

    @classmethod
    def is_openvino_gpu(cls) -> bool:
        return "GPU" in envs.VLLM_OPENVINO_DEVICE

    @classmethod
    def is_pin_memory_available(cls) -> bool:
        # Fixed typo in the log message ("OpenViNO" -> "OpenVINO").
        logger.warning("Pin memory is not supported on OpenVINO.")
        return False

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        """Validate and normalize the vLLM config for OpenVINO.

        Forces float32 + eager mode, selects the KV-cache dtype and block
        size for the target device, and sizes the KV-cache space from
        ``VLLM_OPENVINO_KVCACHE_SPACE``.

        Raises:
            RuntimeError: if ``VLLM_OPENVINO_KVCACHE_SPACE`` is negative.
        """
        from vllm.utils import GiB_bytes

        parallel_config = vllm_config.parallel_config
        assert (parallel_config.world_size == 1
                ), "OpenVINO only supports single CPU socket currently."

        if parallel_config.worker_cls == "auto":
            parallel_config.worker_cls = \
                "vllm.worker.openvino_worker.OpenVINOWorker"

        # check and update model config
        model_config = vllm_config.model_config
        if model_config.dtype != torch.float32:
            logger.warning(
                f"Only float32 dtype is supported on OpenVINO, casting from {model_config.dtype}."  # noqa: G004, E501
            )
            model_config.dtype = torch.float32
        if not model_config.enforce_eager:
            logger.warning(
                "CUDA graph is not supported on OpenVINO backend, fallback to "
                "the eager mode.")
            model_config.enforce_eager = True

        # check and update cache config
        ov_core = ov.Core()
        cache_config = vllm_config.cache_config
        if cache_config and cache_config.block_size is None:
            cache_config.block_size = 16

        if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8":
            if not OpenVinoPlatform.is_openvino_cpu():
                logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is "
                            "ignored for GPU, f16 data type will be used.")
                cache_config.cache_dtype = ov.Type.f16
            else:
                logger.info("KV cache type is overridden to u8 via "
                            "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var.")
                cache_config.cache_dtype = ov.Type.u8
        else:
            if OpenVinoPlatform.is_openvino_cpu():
                # Follow the device's configured inference precision so the
                # KV cache matches what the plugin computes in.
                ov_device = envs.VLLM_OPENVINO_DEVICE
                inference_precision = ov_core.get_property(
                    ov_device, hints.inference_precision)
                if inference_precision == ov.Type.bf16:
                    cache_config.cache_dtype = ov.Type.bf16
                else:
                    cache_config.cache_dtype = ov.Type.f16
            else:
                cache_config.cache_dtype = ov.Type.f16

        if OpenVinoPlatform.is_openvino_cpu():
            if cache_config.block_size != 32:
                logger.info(
                    f"OpenVINO CPU optimal block size is 32, overriding currently set {cache_config.block_size}"  # noqa: G004, E501
                )
                cache_config.block_size = 32
        else:
            if cache_config.block_size != 16:
                logger.info(
                    f"OpenVINO GPU optimal block size is 16, overriding currently set {cache_config.block_size}"  # noqa: G004, E501
                )
                cache_config.block_size = 16

        kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE
        if kv_cache_space >= 0:
            if kv_cache_space == 0 and OpenVinoPlatform.is_openvino_cpu():
                # Default to 4 GiB on CPU when the env var is unset/zero.
                cache_config.openvino_kvcache_space_bytes = 4 * GiB_bytes  # type: ignore
                logger.warning(
                    "Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) "
                    "for OpenVINO backend is not set, using 4 by default.")
            else:
                cache_config.openvino_kvcache_space_bytes = (  # type: ignore
                    kv_cache_space * GiB_bytes)
        else:
            # Zero is accepted above, so the message says "non-negative"
            # (the previous "positive" wording was inaccurate).
            raise RuntimeError(
                "Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE"
                f" {kv_cache_space}, expect a non-negative integer value.")

        assert vllm_config.device_config.device_type == "openvino"
        assert vllm_config.lora_config is None, \
            "OpenVINO backend doesn't support LoRA"
        assert cls.is_openvino_cpu() or \
            cls.is_openvino_gpu(), \
            "OpenVINO backend supports only CPU and GPU devices"
|
||||
@ -1,372 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from collections import defaultdict
|
||||
from typing import Dict, List, NamedTuple, Optional, Tuple
|
||||
|
||||
import openvino as ov
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from vllm.attention import get_attn_backend
|
||||
from vllm.attention.backends.openvino import OpenVINOAttentionMetadata
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.forward_context import set_forward_context
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor import SamplingMetadata
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
from vllm.model_executor.model_loader.openvino import get_model
|
||||
from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
|
||||
MultiModalKwargs, MultiModalPlaceholderMap)
|
||||
from vllm.sequence import SequenceGroupMetadata
|
||||
from vllm.worker.model_runner_base import ModelRunnerBase
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class ModelInput(NamedTuple):
    """Flattened, batched inputs for one OpenVINO model execution step."""
    # Concatenated token ids for every sequence (prefill first, then decode).
    input_tokens: torch.Tensor
    # Position ids, aligned one-to-one with input_tokens.
    input_positions: torch.Tensor
    # Paged-attention metadata; None for the empty placeholder input.
    attn_metadata: Optional[OpenVINOAttentionMetadata]
    # Per-sequence total length (context + tokens processed this step).
    seq_lens: List[int]
    # Per-sequence number of tokens processed this step.
    query_lens: List[int]
    # Batched multi-modal inputs keyed by modality.
    multi_modal_kwargs: BatchedTensorInputs

    @classmethod
    def empty(cls, device):
        """Return a zero-token ModelInput (used when the batch is empty)."""
        return ModelInput(input_tokens=torch.empty(0, device=device),
                          input_positions=torch.empty(0, device=device),
                          attn_metadata=None,
                          seq_lens=[],
                          query_lens=[],
                          multi_modal_kwargs={})
|
||||
|
||||
|
||||
class OpenVINOModelRunner(ModelRunnerBase):
    """Model runner for the OpenVINO backend.

    Converts scheduler output (SequenceGroupMetadata) into flat tensors,
    runs the OpenVINO-compiled model, and performs logits computation and
    sampling on the driver side.
    """

    def __init__(
        self,
        ov_core: ov.Core,
        vllm_config: VllmConfig,
        kv_cache_dtype: Optional[str] = "auto",
        is_driver_worker: bool = False,
        *args,
        **kwargs,
    ):
        # NOTE(review): callers appear to pass cache_config.cache_dtype
        # (an ov.Type) here despite the Optional[str] annotation — confirm.
        self.ov_core = ov_core
        ModelRunnerBase.__init__(self, vllm_config=vllm_config)
        self.is_driver_worker = is_driver_worker

        self.device = self.device_config.device

        self.kv_cache_dtype = kv_cache_dtype
        self.sliding_window = self.model_config.get_sliding_window()
        self.block_size = self.cache_config.block_size

        self.attn_backend = get_attn_backend(
            self.model_config.get_head_size(),
            self.model_config.dtype,
            self.kv_cache_dtype,
            self.block_size,
            self.model_config.is_attention_free,
        )

        # Multi-modal data support
        self.mm_registry = MULTIMODAL_REGISTRY
        self.multi_modal_input_mapper = self.mm_registry \
            .create_input_mapper(self.model_config)

        # Lazy initialization: set by load_model().
        self.model: nn.Module  # Set after init_Model

    def load_model(self) -> None:
        """Load (and compile) the model via the OpenVINO model loader."""
        self.model = get_model(vllm_config=self.vllm_config,
                               kv_cache_dtype=self.kv_cache_dtype,
                               ov_core=self.ov_core)

    def get_model(self) -> nn.Module:
        """Return the loaded model (load_model() must have run first)."""
        return self.model

    def _prepare_model_input(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
    ) -> ModelInput:
        """Prepare the model input based on a given sequence group.

        The API assumes seq_group_metadata_list is sorted by prefill -> decode.

        The result tensors and data structure also batches input in prefill
        -> decode order. For example,

        - input_tokens[:num_prefill_tokens] contains prefill tokens.
        - input_tokens[num_prefill_tokens:] contains decode tokens.
        """
        input_tokens: List[int] = []
        input_positions: List[int] = []

        seq_lens: List[int] = []
        past_lens: List[int] = []
        query_lens: List[int] = []
        multi_modal_kwargs_list: List[MultiModalKwargs] = []
        multi_modal_placeholder_maps: Dict[
            str,
            MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)

        # Prefix-sum style metadata consumed by the OpenVINO paged-attention
        # kernel: subsequence_begins/block_indices_begins are cumulative
        # offsets into input_tokens/block_indices respectively.
        subsequence_begins: List[int] = []
        block_indices: List[int] = []
        block_indices_begins: List[int] = []

        # initialize beginning of prefix sums
        subsequence_begins.append(0)
        block_indices_begins.append(0)

        if len(seq_group_metadata_list) == 0:
            return ModelInput.empty(self.device)

        for seq_group_metadata in seq_group_metadata_list:
            seq_ids = list(seq_group_metadata.seq_data.keys())
            is_prompt = seq_group_metadata.is_prompt

            for seq_id in seq_ids:
                computed_block_nums = seq_group_metadata.computed_block_nums
                if (self.scheduler_config is not None
                        and self.scheduler_config.chunked_prefill_enabled
                        and not (computed_block_nums is None
                                 or computed_block_nums == [])):
                    raise RuntimeError(
                        "chunked prefill cannot be used with prefix caching "
                        "now.")

                seq_data = seq_group_metadata.seq_data[seq_id]
                if is_prompt:
                    computed_len = seq_data.get_num_computed_tokens()
                else:
                    # get_num_computed_tokens is incorrect for spec decoding.
                    # So, we should have a special logic here.
                    # TODO(sang): Fix it.
                    computed_len = seq_data.get_len() - 1

                seq_len = min(
                    seq_data.get_len(),
                    computed_len + seq_group_metadata.token_chunk_size,
                )
                if is_prompt:
                    tokens = seq_data.get_token_ids()[computed_len:seq_len]
                else:
                    # Optimization. get_token_ids requires the entire copy of
                    # tokens.
                    tokens = [seq_data.get_last_token_id()]

                # Prefix cache was hit.
                # Prefix is not supported with sliding_window
                prefix_cache_hit = (computed_block_nums is not None
                                    and len(computed_block_nums) > 0
                                    and self.sliding_window is None
                                    and is_prompt)

                block_table = seq_group_metadata.block_tables[seq_id]
                # TODO(sang): Combine chunked prefill and prefix caching by
                # only allowing multiple of block_size chunk size.
                # NOTE: This only works for oooooooxxx style attention.
                if prefix_cache_hit:
                    assert computed_block_nums is not None
                    computed_len = len(computed_block_nums) * self.block_size
                    tokens = tokens[computed_len:]
                elif (self.scheduler_config.chunked_prefill_enabled
                      or not is_prompt):
                    if seq_group_metadata.block_tables is not None:
                        # chunked prefill or decode
                        block_table = seq_group_metadata.block_tables[seq_id]
                        if self.sliding_window is not None:
                            # chunked prefill doesn't support sliding window.
                            assert not self.scheduler_config.chunked_prefill_enabled  # noqa: E501
                            sliding_window_blocks = (self.sliding_window //
                                                     self.block_size)
                            block_table = block_table[-sliding_window_blocks:]
                    else:
                        # Only happens when memory profiling runs.
                        block_table = []
                else:
                    # prompt phase w/o prefix_caching, chunked_prefill
                    pass

                block_indices.extend(block_table)
                block_indices_begins.append(block_indices_begins[-1] +
                                            len(block_table))

                # TODO(sang): This is a hack to make sliding window work with
                # paged attn. We can remove it if we make paged attn kernel
                # to properly handle slinding window attn.
                if self.sliding_window is not None and not is_prompt:
                    seq_len = min(seq_len, self.sliding_window)
                    computed_len = seq_len - 1

                seq_lens.append(seq_len)

                query_len = seq_len - computed_len
                query_lens.append(query_len)

                input_tokens.extend(tokens)
                positions_range = range(computed_len, seq_len)
                input_positions.extend(list(positions_range))

                past_lens.append(computed_len)
                subsequence_begins.append(subsequence_begins[-1] + query_len)

                if is_prompt:
                    assert len(seq_ids) == 1
                else:
                    # Decode processes exactly one new token per sequence.
                    assert (
                        query_len == 1
                    ), "seq_len: {}, computed_len: {}, query_len: {}".format(
                        seq_len, computed_len, query_len)

                if seq_group_metadata.multi_modal_data:
                    # NOTE: mm_data only includes the subset of multi-modal
                    # items that intersect with the current prefill positions.
                    mm_data, placeholder_maps = MultiModalPlaceholderMap \
                        .from_seq_group(seq_group_metadata, positions_range)

                    if self.mm_registry.has_processor(self.model_config):
                        mm_kwargs = mm_data
                    else:
                        mm_kwargs = self.multi_modal_input_mapper(
                            mm_data,
                            seq_group_metadata.mm_processor_kwargs,
                        )

                    multi_modal_kwargs_list.append(mm_kwargs)

                    for modality, placeholder_map in placeholder_maps.items():
                        multi_modal_placeholder_maps[modality].extend(
                            placeholder_map, )

        max_query_len = max(query_lens)
        assert max_query_len > 0, "query_lens: {}".format(query_lens)

        input_tokens = torch.tensor(input_tokens,
                                    dtype=torch.long,
                                    device=self.device)  # type: ignore
        input_positions = torch.tensor(input_positions,
                                       dtype=torch.long,
                                       device=self.device)  # type: ignore

        past_lens_tensor = torch.tensor(past_lens,
                                        dtype=torch.int32,
                                        device=self.device)  # type: ignore
        subsequence_begins_tensor = torch.tensor(
            subsequence_begins, dtype=torch.int32,
            device=self.device)  # type: ignore
        block_indices_tensor = torch.tensor(block_indices,
                                            dtype=torch.int32,
                                            device=self.device)  # type: ignore
        block_indices_begins_tensor = torch.tensor(
            block_indices_begins, dtype=torch.int32,
            device=self.device)  # type: ignore

        max_context_len = max(seq_lens)
        max_context_len_tensor = torch.tensor(
            max_context_len, dtype=torch.int32,
            device=self.device)  # type: ignore

        placeholder_index_maps = {
            modality: placeholder_map.index_map()
            for modality, placeholder_map in
            multi_modal_placeholder_maps.items()
        }

        attn_metadata = self.attn_backend.make_openvino_metadata(
            past_lens=past_lens_tensor,
            subsequence_begins=subsequence_begins_tensor,
            block_indices=block_indices_tensor,
            block_indices_begins=block_indices_begins_tensor,
            max_context_len=max_context_len_tensor,
            multi_modal_placeholder_index_maps=placeholder_index_maps,
            enable_kv_scales_calculation=False,
        )

        multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list)

        return ModelInput(
            input_tokens,
            input_positions,
            attn_metadata,
            seq_lens,
            query_lens,
            multi_modal_kwargs=multi_modal_kwargs,
        )

    def prepare_input_tensors(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
    ) -> Tuple[torch.Tensor, torch.Tensor, OpenVINOAttentionMetadata,
               SamplingMetadata, BatchedTensorInputs]:
        """Build model-input tensors plus SamplingMetadata for one step."""
        # Prepare input tensors.
        (
            input_tokens,
            input_positions,
            attn_metadata,
            seq_lens,
            query_lens,
            multi_modal_kwargs,
        ) = self._prepare_model_input(seq_group_metadata_list)

        sampling_metadata = SamplingMetadata.prepare(
            seq_group_metadata_list,
            seq_lens,
            query_lens,
            self.device,
            pin_memory=False,
        )

        return (
            input_tokens,
            input_positions,
            attn_metadata,
            sampling_metadata,
            multi_modal_kwargs,
        )

    @torch.inference_mode()
    def execute_model(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
        kv_caches: List[Tuple["ov.Tensor", "ov.Tensor"]],
    ) -> Optional[SamplerOutput]:
        """Run one forward pass, compute logits, and sample next tokens."""
        (
            input_tokens,
            input_positions,
            attn_metadata,
            sampling_metadata,
            multi_modal_kwargs,
        ) = self.prepare_input_tensors(seq_group_metadata_list)

        model_executable = self.model
        execute_model_kwargs = {
            "input_ids":
            input_tokens,
            "positions":
            input_positions,
            "kv_caches":
            kv_caches,
            **MultiModalKwargs.as_kwargs(multi_modal_kwargs or {},
                                         device=self.device),
        }

        with set_forward_context(attn_metadata, self.vllm_config, 0):
            hidden_states = model_executable(**execute_model_kwargs)

        # Compute the logits.
        logits = self.model.compute_logits(hidden_states, sampling_metadata)

        # Sample the next token.
        output = self.model.sample(
            logits=logits,
            sampling_metadata=sampling_metadata,
        )
        return output

    def prepare_model_input(self, *args, **kwargs):
        # Not used by the OpenVINO worker; execute_model drives preparation.
        raise NotImplementedError

    def make_model_input_from_broadcasted_tensor_dict(self, *args, **kwargs):
        # Not used: OpenVINO runs single-process (world_size == 1).
        raise NotImplementedError
|
||||
@ -1,600 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""An OpenVINO worker class."""
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import openvino as ov
|
||||
import torch
|
||||
import torch.distributed
|
||||
import torch.nn as nn
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.attention import get_attn_backend
|
||||
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
|
||||
ParallelConfig, VllmConfig)
|
||||
from vllm.distributed import (broadcast_tensor_dict,
|
||||
ensure_model_parallel_initialized,
|
||||
init_distributed_environment)
|
||||
from vllm.inputs import INPUT_REGISTRY
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor import set_random_seed
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata
|
||||
from vllm.utils import bind_kv_cache
|
||||
from vllm.worker.openvino_model_runner import OpenVINOModelRunner
|
||||
from vllm.worker.worker_base import LoRANotSupportedWorkerBase, WorkerBase
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class OpenVINOCacheEngine:
    """Manages the KV cache for OpenVINO backend.

    This class is responsible for initializing and managing CPU KV
    caches. It also provides methods for performing KV cache operations, such
    as copying.
    """

    def __init__(
        self,
        cache_config: CacheConfig,
        model_config: ModelConfig,
        parallel_config: ParallelConfig,
        device_config: DeviceConfig,
        ov_core: ov.Core,
        ov_device: str,
    ) -> None:
        assert device_config.device_type == "openvino"
        self.cache_config = cache_config
        self.model_config = model_config
        self.parallel_config = parallel_config

        self.head_size = model_config.get_head_size()
        if device_config.device.type == "cpu" and \
                cache_config.cache_dtype == ov.Type.u8:
            # Scale, zero point and quantized data will be stored together.
            # The layout for per token per head:
            # |scale(f32)|zeropoint(f32)|quantized data(u8,idx_1)|quantized data(u8,idx_2)|...|quantized data(u8,idx_head_size)| # noqa: E501
            # so, we have to extend head_size by 8, which is sizeof(float)
            # for scale and sizeof(float) for zeropoint
            self.head_size += 8
        self.num_layers = model_config.get_num_layers(parallel_config)
        self.num_kv_heads = model_config.get_num_kv_heads(parallel_config)

        self.block_size = cache_config.block_size
        # Note: In CacheConfig, num_gpu_blocks actual is num_cpu_blocks
        # for OpenVINO backend with a CPU target device, because we want
        # to reuse KV cache management in the scheduler.
        self.num_device_blocks = cache_config.num_gpu_blocks
        self.num_swap_blocks = cache_config.num_cpu_blocks

        # Get attention backend.
        self.attn_backend = get_attn_backend(
            self.head_size,
            self.model_config.dtype,
            self.cache_config.cache_dtype,
            self.block_size,
            self.model_config.is_attention_free,
        )

        # Initialize the cache.
        self.kv_cache: List[Tuple[ov.Tensor,
                                  ov.Tensor]] = self._allocate_kv_cache(
                                      self.num_device_blocks, ov_core,
                                      ov_device)

        # Initialize the swap.
        self.swap_cache: List[Tuple[ov.Tensor,
                                    ov.Tensor]] = self._allocate_swap_cache(
                                        self.num_swap_blocks, ov_device)

    def _allocate_kv_cache(
        self,
        num_blocks: int,
        ov_core: ov.Core,
        ov_device: str,
    ) -> List[Tuple[ov.Tensor, ov.Tensor]]:
        """Allocates KV cache.

        CPU allocates plain host ov.Tensors; GPU allocates device tensors
        from the OpenVINO remote context (with a transposed key layout).
        """
        k_block_shape = v_block_shape = self.attn_backend.get_kv_cache_shape(
            num_blocks, self.block_size, self.num_kv_heads, self.head_size)[1:]
        kv_cache: List[Tuple[ov.Tensor, ov.Tensor]] = []

        if current_platform.is_openvino_cpu():
            for _ in range(self.num_layers):
                key_blocks = ov.Tensor(self.cache_config.cache_dtype,
                                       k_block_shape)
                value_blocks = ov.Tensor(self.cache_config.cache_dtype,
                                         v_block_shape)
                kv_cache.append((key_blocks, value_blocks))
        else:
            # Update key_cache shape: swap the last two dims for the GPU
            # kernel's expected key layout.
            k_block_shape = (v_block_shape[0], v_block_shape[1],
                             v_block_shape[3], v_block_shape[2])

            remote_context = ov_core.get_default_context(ov_device)

            for _ in range(self.num_layers):
                key_blocks = \
                    remote_context.create_tensor(self.cache_config.cache_dtype,
                                                 ov.Shape(k_block_shape),
                                                 {})

                value_blocks = \
                    remote_context.create_tensor(self.cache_config.cache_dtype,
                                                 ov.Shape(v_block_shape),
                                                 {})

                kv_cache.append((key_blocks, value_blocks))

        return kv_cache

    def _allocate_swap_cache(
        self,
        num_blocks: int,
        ov_device: str,
    ) -> List[Tuple[ov.Tensor, ov.Tensor]]:
        """Allocates swap cache (host-side tensors; GPU target only)."""
        k_block_shape = v_block_shape = self.attn_backend.get_kv_cache_shape(
            num_blocks, self.block_size, self.num_kv_heads, self.head_size)[1:]
        swap_cache: List[Tuple[ov.Tensor, ov.Tensor]] = []

        if num_blocks == 0:
            return swap_cache

        assert not current_platform.is_openvino_cpu(), \
            "CPU device isn't supposed to have swap cache"

        # Update key_cache shape: same transposed layout as the device cache.
        k_block_shape = (v_block_shape[0], v_block_shape[1], v_block_shape[3],
                         v_block_shape[2])

        for _ in range(self.num_layers):
            key_blocks = ov.Tensor(self.cache_config.cache_dtype,
                                   k_block_shape)
            value_blocks = ov.Tensor(self.cache_config.cache_dtype,
                                     v_block_shape)
            swap_cache.append((key_blocks, value_blocks))

        return swap_cache

    def swap_in(self, src_to_dst: List[Tuple[int, int]]) -> None:
        # Copy blocks swap cache -> device cache for every layer.
        for i in range(self.num_layers):
            for swap_tensor, kv_tensor in zip(self.swap_cache[i],
                                              self.kv_cache[i]):
                self.attn_backend.swap_blocks(swap_tensor, kv_tensor,
                                              src_to_dst)

    def swap_out(self, src_to_dst: List[Tuple[int, int]]) -> None:
        # Copy blocks device cache -> swap cache for every layer.
        for i in range(self.num_layers):
            for swap_tensor, kv_tensor in zip(self.swap_cache[i],
                                              self.kv_cache[i]):
                self.attn_backend.swap_blocks(kv_tensor, swap_tensor,
                                              src_to_dst)

    def copy(self, src_to_dsts: List[Tuple[int, int]]) -> None:
        # In-device block copies (e.g. for beam-search forks).
        if (len(src_to_dsts) > 0):
            self.attn_backend.copy_blocks(self.kv_cache, src_to_dsts)

    @staticmethod
    def get_cache_block_size(
        block_size: int,
        cache_dtype: ov.Type,
        model_config: ModelConfig,
        parallel_config: ParallelConfig,
    ) -> int:
        """Return the size in bytes of one KV-cache block (all layers)."""
        head_size = model_config.get_head_size()
        num_kv_heads = model_config.get_num_kv_heads(parallel_config)
        num_layers = model_config.get_num_layers(parallel_config)

        if cache_dtype == ov.Type.u8:
            # Scale, zero point and quantized data will be stored together.
            # The layout for per token per head:
            # |scale(f32)|zeropoint(f32)|quantized data(u8,idx_1)|quantized data(u8,idx_2)|...|quantized data(u8,idx_head_size)| # noqa: E501
            # so, we have to extend head_size by 8, which is sizeof(float)
            # for scale and sizeof(float) for zeropoint
            head_size += 8

        key_cache_block = block_size * num_kv_heads * head_size
        value_cache_block = key_cache_block
        total = num_layers * (key_cache_block + value_cache_block)
        dtype_size = cache_dtype.size
        return dtype_size * total
|
||||
|
||||
|
||||
class OpenVINOWorker(LoRANotSupportedWorkerBase):
|
||||
"""A worker class that executes the model on OpenVINO backend.
|
||||
|
||||
Each worker is associated with a single OpenVINO device. The worker is
|
||||
responsible for maintaining the KV cache and executing the model on the
|
||||
OpenVINO backend.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vllm_config: VllmConfig,
|
||||
local_rank: int,
|
||||
rank: int,
|
||||
distributed_init_method: str,
|
||||
is_driver_worker: bool = False,
|
||||
) -> None:
|
||||
WorkerBase.__init__(self, vllm_config)
|
||||
self.ov_core = ov.Core()
|
||||
self.parallel_config.rank = rank
|
||||
self.local_rank = local_rank
|
||||
self.rank = rank
|
||||
self.distributed_init_method = distributed_init_method
|
||||
self.is_driver_worker = is_driver_worker
|
||||
if self.is_driver_worker:
|
||||
assert self.rank == 0, "The driver worker must have rank 0."
|
||||
|
||||
if self.model_config.trust_remote_code:
|
||||
# note: lazy import to avoid importing torch before initializing
|
||||
from vllm.utils import init_cached_hf_modules
|
||||
|
||||
init_cached_hf_modules()
|
||||
self.model_runner = OpenVINOModelRunner(
|
||||
self.ov_core,
|
||||
vllm_config=self.vllm_config,
|
||||
kv_cache_dtype=self.vllm_config.cache_config.cache_dtype,
|
||||
is_driver_worker=is_driver_worker,
|
||||
)
|
||||
# Uninitialized cache engine. Will be initialized by
|
||||
# initialize_cache.
|
||||
self.cache_engine: OpenVINOCacheEngine
|
||||
self.kv_cache: List[Tuple[ov.Tensor, ov.Tensor]]
|
||||
|
||||
def init_device(self) -> None:
|
||||
self.init_distributed_environment()
|
||||
# Set random seed.
|
||||
set_random_seed(self.model_config.seed)
|
||||
|
||||
def load_model(self):
|
||||
self.model_runner.load_model()
|
||||
|
||||
def determine_num_available_blocks(self) -> Tuple[int, int]:
|
||||
"""Determine the number of blocks available for the KV cache.
|
||||
|
||||
This determines how many KV blocks can fit into the configured
|
||||
KV cache space.
|
||||
"""
|
||||
# For OpenVINO backend, in case of CPU device, the block number will be
|
||||
# calculated based on the openvino_kvcache_space_bytes.
|
||||
cache_block_size = self.get_cache_block_size_bytes()
|
||||
kvcache_space_bytes = self.cache_config.openvino_kvcache_space_bytes
|
||||
|
||||
if current_platform.is_openvino_cpu():
|
||||
num_device_blocks = int(kvcache_space_bytes // cache_block_size)
|
||||
num_swap_blocks = 0
|
||||
else:
|
||||
if kvcache_space_bytes > 0:
|
||||
logger.info("KV_CACHE size was explicitly configured via "
|
||||
"VLLM_OPENVINO_KVCACHE_SPACE environment "
|
||||
"variable, ignoring profiling run.")
|
||||
kv_cache_size = kvcache_space_bytes
|
||||
else:
|
||||
try:
|
||||
kv_cache_size = self.profile_run()
|
||||
except Exception as err:
|
||||
raise RuntimeError(
|
||||
"The error occurred during profile run. This might be "
|
||||
"due to insufficient GPU memory. Consider decreasing "
|
||||
"`max_model_len` to limit the maximum simultaneously "
|
||||
"processed tokens.") from err
|
||||
|
||||
num_device_blocks = int(kv_cache_size // cache_block_size)
|
||||
num_swap_blocks = int(self.cache_config.swap_space_bytes //
|
||||
cache_block_size)
|
||||
|
||||
return num_device_blocks, num_swap_blocks
|
||||
|
||||
def initialize_cache(self, num_gpu_blocks: int,
|
||||
num_cpu_blocks: int) -> None:
|
||||
"""Initialize the KV cache. Swappable CPU memory is only
|
||||
supported on GPU.
|
||||
|
||||
For CPU, we use the num_gpu_blocks to
|
||||
determine how many non-swappable CPU blocks to allocate.
|
||||
"""
|
||||
|
||||
num_device_blocks = num_gpu_blocks
|
||||
num_swap_blocks = num_cpu_blocks
|
||||
|
||||
if current_platform.is_openvino_cpu():
|
||||
assert (num_swap_blocks == 0
|
||||
), f"{type(self)} does not support swappable cache for CPU"
|
||||
|
||||
self._validate_num_blocks(num_device_blocks)
|
||||
self.cache_config.num_gpu_blocks = num_device_blocks
|
||||
self.cache_config.num_cpu_blocks = num_swap_blocks
|
||||
|
||||
# Initialize the cache.
|
||||
self._init_cache_engine()
|
||||
|
||||
def _validate_num_blocks(self, num_blocks: int) -> None:
|
||||
"""Raise errors if the num_blocks is invalid."""
|
||||
if num_blocks <= 0:
|
||||
raise ValueError(
|
||||
"No available memory for the cache blocks. "
|
||||
"Try increasing `VLLM_OPENVINO_KVCACHE_SPACE` when "
|
||||
"initializing the engine.")
|
||||
|
||||
max_seq_len = self.cache_config.block_size * num_blocks
|
||||
if self.model_config.max_model_len > max_seq_len:
|
||||
raise ValueError(
|
||||
f"The model's max seq len ({self.model_config.max_model_len}) "
|
||||
"is larger than the maximum number of tokens that can be "
|
||||
f"stored in KV cache ({max_seq_len}). Try increasing "
|
||||
"`VLLM_OPENVINO_KVCACHE_SPACE` or decreasing `max_model_len` "
|
||||
"when initializing the engine.")
|
||||
|
||||
def _init_cache_engine(self) -> None:
    """Create the OpenVINO cache engine and expose its KV cache.

    Builds an OpenVINOCacheEngine from the current configs, binds the
    resulting KV cache into the static forward context, and syncs the
    model runner's block size with the engine's.
    """
    ov_device = envs.VLLM_OPENVINO_DEVICE
    self.cache_engine = OpenVINOCacheEngine(
        self.cache_config,
        self.model_config,
        self.parallel_config,
        self.device_config,
        self.ov_core,
        ov_device,
    )
    self.kv_cache = self.cache_engine.kv_cache
    # bind_kv_cache expects a list of per-virtual-engine caches; this
    # worker has a single one.
    bind_kv_cache(self.compilation_config.static_forward_context,
                  [self.kv_cache])
    # Keep the model runner's block size consistent with the engine.
    self.model_runner.block_size = self.cache_engine.block_size

    assert self.kv_cache is not None

    # Populate the cache to warmup the memory
    if current_platform.is_openvino_cpu():
        for key_cache, value_cache in self.kv_cache:
            key_cache.data[:] = 0
            value_cache.data[:] = 0
|
||||
|
||||
def cache_swap_in(self, src_to_dst: List[Tuple[int, int]]) -> None:
    """Delegate swap-in of the given (src, dst) block pairs to the cache engine."""
    self.cache_engine.swap_in(src_to_dst)
|
||||
|
||||
def cache_swap_out(self, src_to_dst: List[Tuple[int, int]]) -> None:
    """Delegate swap-out of the given (src, dst) block pairs to the cache engine."""
    self.cache_engine.swap_out(src_to_dst)
|
||||
|
||||
def cache_copy(
    self,
    blocks_to_copy: List[Tuple[int, int]],
) -> None:
    """Delegate copying of the given (src, dst) block pairs to the cache engine."""
    self.cache_engine.copy(blocks_to_copy)  # type: ignore
|
||||
|
||||
def get_model(self) -> nn.Module:
    """Return the underlying model held by the model runner."""
    return self.model_runner.get_model()
|
||||
|
||||
@torch.inference_mode()
def execute_model(
    self,
    execute_model_req: Optional[ExecuteModelRequest] = None,
) -> List[SamplerOutput]:
    """Execute one model step.

    The driver worker extracts the swap/copy metadata from
    ``execute_model_req`` and broadcasts it to the other workers;
    non-driver workers are called with ``execute_model_req=None`` and
    receive the metadata via ``broadcast_tensor_dict``.

    Returns:
        A single-element list with the step's SamplerOutput (this
        worker only supports single-step execution), or an empty list
        when there are no sequence groups to run.
    """
    if execute_model_req is None:
        seq_group_metadata_list = None
    else:
        seq_group_metadata_list = execute_model_req.seq_group_metadata_list

    if self.is_driver_worker:
        assert seq_group_metadata_list is not None
        num_seq_groups: int = len(seq_group_metadata_list)
        assert execute_model_req is not None
        blocks_to_copy = execute_model_req.blocks_to_copy
        blocks_to_swap_in = execute_model_req.blocks_to_swap_in
        blocks_to_swap_out = execute_model_req.blocks_to_swap_out
        data: Dict[str, Any] = {
            "num_seq_groups": num_seq_groups,
            "blocks_to_copy": blocks_to_copy,
            "blocks_to_swap_in": blocks_to_swap_in,
            "blocks_to_swap_out": blocks_to_swap_out,
        }
        broadcast_tensor_dict(data, src=0)
    else:
        data = broadcast_tensor_dict(src=0)
        num_seq_groups = data["num_seq_groups"]
        blocks_to_copy = data["blocks_to_copy"]
        blocks_to_swap_in = data["blocks_to_swap_in"]
        blocks_to_swap_out = data["blocks_to_swap_out"]

    if current_platform.is_openvino_cpu():
        # BUG FIX: this branch previously asserted on
        # `execute_model_req.blocks_to_swap_in/out`, but on non-driver
        # workers `execute_model_req` is None (the metadata arrives via
        # broadcast above), which raised AttributeError. Assert on the
        # locals that are populated in both branches instead.
        assert len(blocks_to_swap_in) == 0
        assert len(blocks_to_swap_out) == 0
    else:
        self.cache_swap_in(blocks_to_swap_in)
        self.cache_swap_out(blocks_to_swap_out)

    self.cache_copy(blocks_to_copy)

    # If there is no input, we don't need to execute the model.
    if num_seq_groups == 0:
        return []

    output = self.model_runner.execute_model(seq_group_metadata_list,
                                             self.kv_cache)

    # OpenVINO worker only supports single-step execution.
    return [output]
|
||||
|
||||
def init_distributed_environment(self) -> None:
    """Initialize torch.distributed (gloo backend) and model parallelism."""
    init_distributed_environment(
        world_size=self.parallel_config.world_size,
        rank=self.rank,
        distributed_init_method=self.distributed_init_method,
        backend="gloo",
    )

    # A small all_reduce for warmup.
    torch.distributed.all_reduce(torch.zeros(1).cpu())

    ensure_model_parallel_initialized(
        self.parallel_config.tensor_parallel_size,
        self.parallel_config.pipeline_parallel_size,
    )
|
||||
|
||||
def get_cache_block_size_bytes(self) -> int:
    """Return the size in bytes of a single KV cache block.

    Delegates the computation to OpenVINOCacheEngine using this
    worker's block size, cache dtype, model and parallel configs.
    """
    return OpenVINOCacheEngine.get_cache_block_size(
        self.cache_config.block_size,
        self.cache_config.cache_dtype,
        self.model_config,
        self.parallel_config,
    )
|
||||
|
||||
def profile_run(self) -> int:
    """Run the model with dummy inputs on a GPU device and estimate the
    memory left for the KV cache.

    Returns:
        The number of bytes available for the KV cache, computed as
        ``total_device_memory * gpu_memory_utilization - used_device_mem``.
        NOTE(review): this product can be a float despite the ``int``
        annotation — callers appear to truncate it; confirm.

    Raises:
        RuntimeError: if the profiled usage already exceeds the total
            device memory.
    """
    ov_device = envs.VLLM_OPENVINO_DEVICE

    # The CPU path sizes its cache from VLLM_OPENVINO_KVCACHE_SPACE and
    # must never reach this code.
    assert not current_platform.is_openvino_cpu(), \
        "CPU device isn't supposed to use profile run."

    import openvino.properties.device as device
    import openvino.properties.intel_gpu as intel_gpu

    ov_core = self.ov_core
    cache_config = self.cache_config
    model_config = self.model_config
    parallel_config = self.parallel_config
    device_config = self.device_config
    input_registry = INPUT_REGISTRY
    mm_registry = MULTIMODAL_REGISTRY
    mm_registry.init_mm_limits_per_prompt(model_config)

    # Execute a forward pass with dummy inputs to profile the memory usage
    # of the model.
    def model_profile_run():
        top_k = model_config.get_vocab_size() - 1
        sampling_params = SamplingParams(top_p=0.99, top_k=top_k)

        max_num_batched_tokens = \
            self.scheduler_config.max_num_batched_tokens
        max_num_seqs = self.scheduler_config.max_num_seqs
        # Throwaway cache config with a single device block — just
        # enough for the model to execute once.
        tmp_cache_config = CacheConfig(cache_config.block_size,
                                       cache_config.gpu_memory_utilization,
                                       cache_config.swap_space_bytes,
                                       "auto")
        tmp_cache_config.num_gpu_blocks = 1
        tmp_cache_config.num_cpu_blocks = 0
        tmp_cache_config.cache_dtype = cache_config.cache_dtype

        profiling_cache_engine = OpenVINOCacheEngine(
            tmp_cache_config, model_config, parallel_config, device_config,
            ov_core, ov_device)

        # Profile memory usage with max_num_seqs sequences and the total
        # number of tokens equal to max_num_batched_tokens.
        seqs: List[SequenceGroupMetadata] = []
        for group_id in range(max_num_seqs):
            # Spread the batched tokens as evenly as possible: the first
            # (tokens % seqs) sequences get one extra token.
            seq_len = (max_num_batched_tokens // max_num_seqs +
                       (group_id < max_num_batched_tokens % max_num_seqs))
            block_size = cache_config.block_size
            seq_num_blocks = (seq_len + block_size - 1) // block_size

            dummy_data = input_registry \
                .dummy_data_for_profiling(model_config,
                                          seq_len,
                                          mm_registry)

            # NOTE(review): every row aliases the same inner list; fine
            # for this read-only profiling pass, but would be a bug if
            # the tables were ever mutated.
            block_tables = [[0] * seq_num_blocks] * max_num_seqs
            seq = SequenceGroupMetadata(
                request_id=str(group_id),
                is_prompt=True,
                seq_data={group_id: dummy_data.seq_data},
                sampling_params=sampling_params,
                block_tables=block_tables,
                lora_request=None,
                multi_modal_data=dummy_data.multi_modal_data)
            seqs.append(seq)

        self.model_runner.block_size = tmp_cache_config.block_size

        bind_kv_cache(self.compilation_config.static_forward_context,
                      profiling_cache_engine.kv_cache)
        # Run the model with the dummy inputs.
        self.model_runner.execute_model(seqs,
                                        profiling_cache_engine.kv_cache)

        # Explicitly revert bind_kv_cache and delete temporary KV cache
        # manager to free KV cache when real inputs will be passed to OV
        bind_kv_cache(self.compilation_config.static_forward_context, [[
            torch.tensor([])
            for _ in range(len(profiling_cache_engine.kv_cache))
        ]])
        del profiling_cache_engine

    logger.info(
        "Start profiling run with dummy inputs to evaluate "
        "memory usage for %s. It might take a while.", ov_device)

    model_profile_run()

    gpu_device_type = ov_core.get_property(ov_device, device.type)
    memory_statistics = \
        ov_core.get_property(ov_device, intel_gpu.memory_statistics)
    memory_utilization = cache_config.gpu_memory_utilization

    # Warn on iGPU with aggressive utilization: device memory is shared
    # with the host there.
    if gpu_device_type == device.Type.INTEGRATED and \
            memory_utilization >= 0.9:
        logger.warning(
            "iGPU is used with high gpu_memory_utilization=%f "
            "value. This may cause low performance due to "
            "occupying the majority of available system "
            "memory. Please consider decreasing "
            "gpu_memory_utilization or explicitly setting "
            "`VLLM_OPENVINO_KVCACHE_SPACE` (GB) environment "
            "variable.", memory_utilization)

    # sum up all used device memory
    device_memory_types = ["cl_mem", "usm_device"]
    used_device_mem = \
        sum(memory_statistics.get(key, 0) for key in device_memory_types)

    # On integrated GPUs, host USM allocations occupy the same shared
    # memory pool, so count them too.
    if gpu_device_type == device.Type.INTEGRATED:
        used_device_mem += memory_statistics.get("usm_host", 0)

    # there could be unaccounted extra memory reserved by kernels, kept
    # in memory pools, etc
    # therefore, add a threshold to account for this
    used_memory_threshold = 1.1
    used_device_mem *= used_memory_threshold

    total_device_memory = \
        ov_core.get_property(ov_device, intel_gpu.device_total_mem_size)

    def format_memory_size(size) -> str:
        """Render a byte count as a human-readable B/KB/MB/GB string."""
        units = ["B", "KB", "MB", "GB"]
        unit_index = 0

        while size > 1024 and unit_index < len(units) - 1:
            size /= 1024
            unit_index += 1

        return f"{size:.2f} {units[unit_index]}"

    # Dropped the redundant format(...) wrapper — it is a no-op on str.
    total_device_memory_str = format_memory_size(total_device_memory)
    used_device_memory_str = format_memory_size(used_device_mem)

    logger.info(
        "Total %s memory: %s. "
        "Amount of memory required to run the model with "
        "max_num_batched_tokens=%d: %s.", ov_device,
        total_device_memory_str,
        self.scheduler_config.max_num_batched_tokens,
        used_device_memory_str)

    if used_device_mem >= total_device_memory:
        # BUG FIX: the second fragment was missing its f-prefix, so
        # "{total_device_memory_str}" was emitted literally in the error.
        raise RuntimeError(
            f"The required memory size {used_device_memory_str} for model "
            "is higher than the total available device "
            f"memory {total_device_memory_str}. Please consider to "
            "decrease `max_num_batched_tokens` or increase "
            "`gpu_memory_utilization`")

    return total_device_memory * memory_utilization - used_device_mem
|
||||
Loading…
x
Reference in New Issue
Block a user