From 06dd08256f076689945418cd61397c1759f4abfa Mon Sep 17 00:00:00 2001 From: Yu Chin Fabian Lim Date: Fri, 21 Mar 2025 08:44:37 +0800 Subject: [PATCH 01/40] Enforce that TP > 1 is not supported for Mamba2 if Quantization is Enabled. (#14617) Signed-off-by: Yu Chin Fabian Lim --- .../layers/mamba/mamba_mixer2.py | 37 +++++++++++-------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index 53d68b60f2fde..fec6d6112d665 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -251,6 +251,9 @@ class MambaMixer2(CustomOp): "then num_groups must equal 1." ) + assert self.tp_size == 1 or quant_config is None, \ + "Tensor parallel currently not supported for quantized models." + self.ssm_state_size = ssm_state_size self.activation = activation @@ -331,22 +334,24 @@ class MambaMixer2(CustomOp): ], self.tp_size, tp_rank) }) - delattr(self.in_proj.weight, "weight_loader") - set_weight_attrs( - self.in_proj.weight, - { - "weight_loader": - mamba_v2_sharded_weight_loader( - [ - intermediate_settings, # for gate - intermediate_settings, - group_shard_settings, - group_shard_settings, - head_setings, # for dt - ], - self.tp_size, - tp_rank) - }) + if quant_config is None: + # - quant layers do not have a weight loader + delattr(self.in_proj.weight, "weight_loader") + set_weight_attrs( + self.in_proj.weight, + { + "weight_loader": + mamba_v2_sharded_weight_loader( + [ + intermediate_settings, # for gate + intermediate_settings, + group_shard_settings, + group_shard_settings, + head_setings, # for dt + ], + self.tp_size, + tp_rank) + }) # - these are TPed by heads to reduce the size of the # temporal shape From 0c6f5023c390075e842bb7c70bb8f5aa433c584c Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 20 Mar 2025 17:50:43 -0700 Subject: [PATCH 02/40] [V1] Scheduler Refactoring [1/N] - Add Scheduler 
Interface (#15250) Signed-off-by: Woosuk Kwon Co-authored-by: Cody Yu Co-authored-by: Nick Hill --- tests/plugins_tests/test_scheduler_plugins.py | 2 +- tests/v1/core/test_scheduler.py | 3 +- tests/v1/worker/test_gpu_model_runner.py | 4 +- vllm/engine/arg_utils.py | 2 +- vllm/executor/ray_utils.py | 2 +- vllm/v1/attention/backends/flash_attn.py | 2 +- vllm/v1/attention/backends/mla/common.py | 2 +- vllm/v1/core/sched/__init__.py | 0 vllm/v1/core/sched/interface.py | 139 ++++++++++++++++++ .../{scheduler_output.py => sched/output.py} | 0 vllm/v1/core/{ => sched}/scheduler.py | 37 +---- vllm/v1/core/sched/utils.py | 22 +++ vllm/v1/engine/core.py | 4 +- vllm/v1/worker/gpu_model_runner.py | 2 +- vllm/v1/worker/gpu_worker.py | 2 +- vllm/v1/worker/tpu_model_runner.py | 2 +- vllm/v1/worker/tpu_worker.py | 2 +- 17 files changed, 182 insertions(+), 45 deletions(-) create mode 100644 vllm/v1/core/sched/__init__.py create mode 100644 vllm/v1/core/sched/interface.py rename vllm/v1/core/{scheduler_output.py => sched/output.py} (100%) rename vllm/v1/core/{ => sched}/scheduler.py (96%) create mode 100644 vllm/v1/core/sched/utils.py diff --git a/tests/plugins_tests/test_scheduler_plugins.py b/tests/plugins_tests/test_scheduler_plugins.py index 7abf5066a4133..4c95a52a967bd 100644 --- a/tests/plugins_tests/test_scheduler_plugins.py +++ b/tests/plugins_tests/test_scheduler_plugins.py @@ -6,7 +6,7 @@ from vllm.core.scheduler import Scheduler from vllm.engine.arg_utils import EngineArgs from vllm.engine.llm_engine import LLMEngine from vllm.sampling_params import SamplingParams -from vllm.v1.core.scheduler import Scheduler as V1Scheduler +from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 9413373390fe2..8916aa580000a 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -6,7 +6,8 @@ import 
pytest from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from vllm.sampling_params import SamplingParams -from vllm.v1.core.scheduler import Scheduler, SchedulerOutput +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.core.sched.scheduler import Scheduler from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus from vllm.v1.structured_output import StructuredOutputManager diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 345519a07e411..dd95a7f53064e 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -3,8 +3,8 @@ import pytest from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig from vllm.sampling_params import SamplingParams -from vllm.v1.core.scheduler_output import (CachedRequestData, NewRequestData, - SchedulerOutput) +from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, + SchedulerOutput) from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.worker.gpu_model_runner import GPUModelRunner diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 5015f1d684b76..bbe780a0ec118 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1695,7 +1695,7 @@ class EngineArgs: # V1 should use the new scheduler by default. # Swap it only if this arg is set to the original V0 default if self.scheduler_cls == EngineArgs.scheduler_cls: - self.scheduler_cls = "vllm.v1.core.scheduler.Scheduler" + self.scheduler_cls = "vllm.v1.core.sched.scheduler.Scheduler" # When no user override, set the default values based on the usage # context. 
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index c1bf2fb316d9b..a7042ca8df17c 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -17,7 +17,7 @@ from vllm.utils import get_ip from vllm.worker.worker_base import WorkerWrapperBase if TYPE_CHECKING: - from vllm.v1.core.scheduler import SchedulerOutput + from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.outputs import ModelRunnerOutput logger = init_logger(__name__) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 637c01556ac1c..27b3aabbc3504 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -17,7 +17,7 @@ from vllm.platforms import current_platform from vllm.utils import cdiv if TYPE_CHECKING: - from vllm.v1.core.scheduler_output import SchedulerOutput + from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.worker.gpu_input_batch import InputBatch from vllm.v1.worker.gpu_model_runner import GPUModelRunner diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index f801745ab5c7d..188a425b107e4 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -212,7 +212,7 @@ except ImportError: from flash_attn import flash_attn_varlen_func if TYPE_CHECKING: - from vllm.v1.core.scheduler_output import SchedulerOutput + from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.worker.gpu_input_batch import InputBatch from vllm.v1.worker.gpu_model_runner import GPUModelRunner diff --git a/vllm/v1/core/sched/__init__.py b/vllm/v1/core/sched/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py new file mode 100644 index 0000000000000..bfed44f9d58c8 --- /dev/null +++ b/vllm/v1/core/sched/interface.py @@ -0,0 +1,139 @@ +# SPDX-License-Identifier: Apache-2.0 
+from abc import ABC, abstractmethod +from collections.abc import Iterable +from typing import TYPE_CHECKING, Optional, Union + +if TYPE_CHECKING: + from vllm.v1.core.sched.output import SchedulerOutput + from vllm.v1.engine import EngineCoreOutputs + from vllm.v1.metrics.stats import SchedulerStats + from vllm.v1.outputs import ModelRunnerOutput + from vllm.v1.request import Request, RequestStatus + + +class SchedulerInterface(ABC): + + @abstractmethod + def schedule(self) -> "SchedulerOutput": + """Schedule the requests to process in this scheduling step. + + The scheduling decision is made at the iteration level. Each scheduling + step corresponds to a single forward pass of the model. Therefore, this + method is called repeatedly by a busy loop in the engine. + + Essentially, the scheduler produces a dictionary of {req_id: num_tokens} + that specifies how many tokens to process for each request in this + scheduling step. For example, num_tokens can be as large as the number + of prompt tokens for new requests, or it can be 1 for the requests that + are auto-regressively generating new tokens one by one. Otherwise, it + can be somewhere in between in case of chunked prefills, prefix caching, + speculative decoding, etc. + + Additionally, the scheduler also returns useful data about each request + or the batch as a whole. The model runner will use this information in + preparing inputs to the model. + + Returns: + A SchedulerOutput object containing information about the scheduled + requests. + """ + raise NotImplementedError + + @abstractmethod + def update_from_output( + self, + scheduler_output: "SchedulerOutput", + model_runner_output: "ModelRunnerOutput", + ) -> "EngineCoreOutputs": + """Update the scheduler state based on the model runner output. + + This method is called after the model runner has processed the scheduled + requests. The model runner output includes generated token ids, draft + token ids for next step, etc. 
The scheduler uses this information to + update its states, checks the finished requests, and returns the output + for each request. + + Returns: + A EngineCoreOutputs object containing the outputs for each request. + """ + raise NotImplementedError + + @abstractmethod + def add_request(self, request: "Request") -> None: + """Add a new request to the scheduler's internal queue. + + Args: + request: The new request being added. + """ + raise NotImplementedError + + @abstractmethod + def finish_requests( + self, + request_ids: Union[str, Iterable[str]], + finished_status: "RequestStatus", + ) -> None: + """Finish the requests in the scheduler's internal queue. If the request + is not in the queue, this method will do nothing. + + This method is called in two cases: + 1. When the request is aborted by the client. + 2. When the frontend process detects a stop string of the request after + de-tokenizing its generated tokens. + + Args: + request_ids: A single or a list of request IDs. + finished_status: The finished status of the given requests. + """ + raise NotImplementedError + + @abstractmethod + def get_num_unfinished_requests(self) -> int: + """Number of unfinished requests in the scheduler's internal queue.""" + raise NotImplementedError + + def has_unfinished_requests(self) -> bool: + """Returns True if there are unfinished requests in the scheduler's + internal queue.""" + return self.get_num_unfinished_requests() > 0 + + @abstractmethod + def has_finished_requests(self) -> bool: + """Returns True if there are finished requests that need to be cleared. + NOTE: This is different from `not self.has_unfinished_requests()`. + + The scheduler maintains an internal list of the requests finished in the + previous step. This list is returned from the next call to schedule(), + to be sent to the model runner in the next step to clear cached states + for these finished requests. + + This method checks if this internal list of finished requests is + non-empty. 
This information is useful for DP attention. + """ + raise NotImplementedError + + def has_requests(self) -> bool: + """Returns True if there are unfinished requests, or finished requests + not yet returned in SchedulerOutputs.""" + return self.has_unfinished_requests() or self.has_finished_requests() + + @abstractmethod + def get_num_unscheduled_requests(self) -> int: + """Number of requests that are not being processed by the executor.""" + raise NotImplementedError + + @abstractmethod + def reset_prefix_cache(self) -> bool: + """Reset the prefix cache for KV cache. + + This is particularly required when the model weights are live-updated. + """ + raise NotImplementedError + + @abstractmethod + def make_stats(self) -> Optional["SchedulerStats"]: + """Make a SchedulerStats object for logging. + + The SchedulerStats object is created for every scheduling step. + """ + raise NotImplementedError diff --git a/vllm/v1/core/scheduler_output.py b/vllm/v1/core/sched/output.py similarity index 100% rename from vllm/v1/core/scheduler_output.py rename to vllm/v1/core/sched/output.py diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/sched/scheduler.py similarity index 96% rename from vllm/v1/core/scheduler.py rename to vllm/v1/core/sched/scheduler.py index 056458ef9dd28..d002a19b08a41 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -13,8 +13,10 @@ from vllm.logger import init_logger from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager, compute_encoder_budget) from vllm.v1.core.kv_cache_manager import KVCacheManager -from vllm.v1.core.scheduler_output import (CachedRequestData, NewRequestData, - SchedulerOutput) +from vllm.v1.core.sched.interface import SchedulerInterface +from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, + SchedulerOutput) +from vllm.v1.core.sched.utils import check_stop from vllm.v1.engine import (EngineCoreEventType, EngineCoreOutput, EngineCoreOutputs) from vllm.v1.metrics.stats 
import SchedulerStats @@ -25,7 +27,7 @@ from vllm.v1.structured_output import StructuredOutputManager logger = init_logger(__name__) -class Scheduler: +class Scheduler(SchedulerInterface): def __init__( self, @@ -602,7 +604,7 @@ class Scheduler: # Check for stop and update request state. # This must be called before we make the EngineCoreOutput. - stopped = self._check_stop(request) + stopped = check_stop(request, self.max_model_len) if stopped: self._free_request(request) break @@ -648,25 +650,6 @@ class Scheduler: scheduler_stats=self.make_stats(), ) - def _check_stop(self, request: Request) -> bool: - if (request.num_tokens >= self.max_model_len - or request.num_output_tokens >= request.max_tokens): - request.status = RequestStatus.FINISHED_LENGTH_CAPPED - return True - - sampling_params = request.sampling_params - last_token_id = request.output_token_ids[-1] - if (not sampling_params.ignore_eos - and last_token_id == request.eos_token_id): - request.status = RequestStatus.FINISHED_STOPPED - return True - - if last_token_id in (sampling_params.stop_token_ids or ()): - request.status = RequestStatus.FINISHED_STOPPED - request.stop_reason = last_token_id - return True - return False - def add_request(self, request: Request) -> None: self.waiting.append(request) self.requests[request.request_id] = request @@ -715,17 +698,9 @@ class Scheduler: def get_num_unfinished_requests(self) -> int: return len(self.waiting) + len(self.running) - def has_unfinished_requests(self) -> bool: - return self.get_num_unfinished_requests() > 0 - def has_finished_requests(self) -> bool: return len(self.finished_req_ids) > 0 - def has_requests(self): - """Returns True if there are unfinished requests, or finished requests - not yet returned in SchedulerOutputs.""" - return self.has_unfinished_requests() or self.has_finished_requests() - def get_num_unscheduled_requests(self) -> int: """Number of requests that are not being processed by the executor.""" return 
self.get_num_unfinished_requests() - len(self.scheduled_req_ids) diff --git a/vllm/v1/core/sched/utils.py b/vllm/v1/core/sched/utils.py new file mode 100644 index 0000000000000..3a0028a59016e --- /dev/null +++ b/vllm/v1/core/sched/utils.py @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: Apache-2.0 +from vllm.v1.request import Request, RequestStatus + + +def check_stop(request: Request, max_model_len: int) -> bool: + if (request.num_tokens >= max_model_len + or request.num_output_tokens >= request.max_tokens): + request.status = RequestStatus.FINISHED_LENGTH_CAPPED + return True + + sampling_params = request.sampling_params + last_token_id = request.output_token_ids[-1] + if (not sampling_params.ignore_eos + and last_token_id == request.eos_token_id): + request.status = RequestStatus.FINISHED_STOPPED + return True + + if last_token_id in (sampling_params.stop_token_ids or ()): + request.status = RequestStatus.FINISHED_STOPPED + request.stop_reason = last_token_id + return True + return False diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index b0c18aee97c28..1598e6b8443fe 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -22,8 +22,8 @@ from vllm.transformers_utils.config import ( from vllm.utils import (get_exception_traceback, resolve_obj_by_qualname, zmq_socket_ctx) from vllm.v1.core.kv_cache_utils import get_kv_cache_configs -from vllm.v1.core.scheduler import Scheduler as V1Scheduler -from vllm.v1.core.scheduler import SchedulerOutput +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, EngineCoreRequestType, UtilityOutput) from vllm.v1.engine.mm_input_cache import MMInputCacheServer diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c82bcec25d245..b186300a00330 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -45,7 
+45,7 @@ from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin if TYPE_CHECKING: import xgrammar as xgr - from vllm.v1.core.scheduler_output import SchedulerOutput + from vllm.v1.core.sched.output import SchedulerOutput else: xgr = LazyLoader("xgr", globals(), "xgrammar") diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 241869e35c620..a63a2d022378e 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -28,7 +28,7 @@ from vllm.v1.worker.worker_base import WorkerBase logger = init_logger(__name__) if TYPE_CHECKING: - from vllm.v1.core.scheduler_output import SchedulerOutput + from vllm.v1.core.sched.output import SchedulerOutput class Worker(WorkerBase): diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index b7924752aec8d..ec3dcbc064cba 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -37,7 +37,7 @@ from vllm.v1.utils import bind_kv_cache from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch if TYPE_CHECKING: - from vllm.v1.core.scheduler import SchedulerOutput + from vllm.v1.core.sched.output import SchedulerOutput logger = init_logger(__name__) diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index 9f59561192753..dbb231950d08d 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -17,7 +17,7 @@ from vllm.distributed import (ensure_model_parallel_initialized, from vllm.logger import init_logger from vllm.model_executor import set_random_seed from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE -from vllm.v1.core.scheduler import SchedulerOutput +from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheSpec) from vllm.v1.outputs import ModelRunnerOutput From 0cfe7d386d66623eca6920c65f12c3179d346c81 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Thu, 20 Mar 2025 
21:28:53 -0400 Subject: [PATCH 03/40] [CI/Build] LoRA : make add_lora_test safer (#15181) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- tests/lora/test_add_lora.py | 48 +++++++------------------------------ 1 file changed, 8 insertions(+), 40 deletions(-) diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py index 644a075b6dddc..c8b7a5cbf7470 100644 --- a/tests/lora/test_add_lora.py +++ b/tests/lora/test_add_lora.py @@ -1,10 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio import time -from pathlib import Path import pytest -from huggingface_hub import snapshot_download import vllm.envs as env from vllm.engine.arg_utils import AsyncEngineArgs @@ -13,35 +11,9 @@ from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams from vllm.utils import merge_async_iterators -MODEL_PATH = "meta-llama/Llama-2-7b-hf" -LORA_MODULE_DOWNLOAD_PATH = None # Populated by download_and_prepare_lora_module() #noqa -LORA_RANK = 8 -DEFAULT_MAX_LORAS = 16 * 3 - - -def download_and_prepare_lora_module(): - """ - Request submission is expensive when the LoRA adapters have their own - tokenizers. This is because, for each request with a new LoRA adapter ID, - the front-end loads the tokenizer from disk. - - In this test, as we are comparing request processing times, we want to - minimize any extra activity. To this effect, we download the LoRA - adapter and remove all the tokenizer files, so the engine will default - to the base model tokenizer. 
- """ - global LORA_MODULE_DOWNLOAD_PATH - - LORA_MODULE_HF_PATH = "yard1/llama-2-7b-sql-lora-test" - LORA_MODULE_DOWNLOAD_PATH = snapshot_download(repo_id=LORA_MODULE_HF_PATH) - - tokenizer_files = [ - 'added_tokens.json', 'tokenizer_config.json', 'tokenizer.json', - 'tokenizer.model' - ] - for tokenizer_file in tokenizer_files: - del_path = Path(LORA_MODULE_DOWNLOAD_PATH) / tokenizer_file - del_path.unlink(missing_ok=True) +MODEL_PATH = "THUDM/chatglm3-6b" +LORA_RANK = 64 +DEFAULT_MAX_LORAS = 4 * 3 @pytest.fixture(autouse=True) @@ -52,11 +24,9 @@ def v1(run_with_both_engines_lora): pass -def get_lora_requests() -> list[LoRARequest]: +def get_lora_requests(lora_path) -> list[LoRARequest]: lora_requests: list[LoRARequest] = [ - LoRARequest(lora_name=f"{i}", - lora_int_id=i, - lora_path=LORA_MODULE_DOWNLOAD_PATH) + LoRARequest(lora_name=f"{i}", lora_int_id=i, lora_path=lora_path) for i in range(1, DEFAULT_MAX_LORAS + 1) ] return lora_requests @@ -93,7 +63,7 @@ async def requests_processing_time(llm, @pytest.mark.asyncio -async def test_add_lora(): +async def test_add_lora(chatglm3_lora_files): """ The add_lora function is used to pre-load some LoRA adapters into the engine in anticipation of future requests using these adapters. To test @@ -103,10 +73,7 @@ async def test_add_lora(): We measure the request processing time in both cases and expect the time to be lesser in the case with add_lora() calls. """ - - download_and_prepare_lora_module() - - lora_requests: list[LoRARequest] = get_lora_requests() + lora_requests: list[LoRARequest] = get_lora_requests(chatglm3_lora_files) max_loras = len(set([lr.lora_int_id for lr in lora_requests])) # Create engine in eager-mode. 
Due to high max_loras, the CI can @@ -118,6 +85,7 @@ async def test_add_lora(): max_lora_rank=LORA_RANK, max_model_len=128, gpu_memory_utilization=0.8, #avoid OOM + trust_remote_code=True, enforce_eager=True) # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1` From d3ccbd6350bf8d601686e28fd059d1e6f53fcb4a Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Thu, 20 Mar 2025 19:01:11 -0700 Subject: [PATCH 04/40] Fix CUDA kernel index data type in vllm/csrc/quantization/fused_kernels/layernorm_utils.cuh +10 (#15159) Signed-off-by: Lu Fang Co-authored-by: Richard Barnes --- .../fused_kernels/layernorm_utils.cuh | 12 +- csrc/quantization/gguf/dequantize.cuh | 50 ++++----- csrc/quantization/gguf/gguf_kernel.cu | 4 +- csrc/quantization/gguf/mmq.cuh | 12 +- csrc/quantization/gguf/mmvq.cuh | 4 +- csrc/quantization/gguf/moe.cuh | 10 +- csrc/quantization/gptq/q_gemm.cu | 106 +++++++++--------- .../gptq_allspark/allspark_qgemm_w8a16.cu | 38 +++---- .../gptq_allspark/allspark_repack.cu | 8 +- .../gptq_allspark/allspark_utils.cuh | 4 +- 10 files changed, 124 insertions(+), 124 deletions(-) diff --git a/csrc/quantization/fused_kernels/layernorm_utils.cuh b/csrc/quantization/fused_kernels/layernorm_utils.cuh index cec6b54edb569..b5cea98f7706e 100644 --- a/csrc/quantization/fused_kernels/layernorm_utils.cuh +++ b/csrc/quantization/fused_kernels/layernorm_utils.cuh @@ -24,7 +24,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input, // sum of squares float ss = 0.0f; - for (int32_t i = threadIdx.x; i < hidden_size; i += blockDim.x) { + for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) { float x = static_cast(input[token_offset + i]); if constexpr (has_residual) { x += static_cast(residual[token_offset + i]); @@ -58,7 +58,7 @@ __device__ void compute_dynamic_per_token_scales( constexpr scalar_out_t qmax{std::numeric_limits::max()}; float block_absmax_val_maybe = 0.0f; - for (int32_t i = 
threadIdx.x; i < hidden_size; i += blockDim.x) { + for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) { float x = static_cast(input[token_offset + i]); if constexpr (has_residual) { x += static_cast(residual[token_offset + i]); @@ -103,7 +103,7 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output, int64_t const token_offset = blockIdx.x * static_cast(hidden_size); ; - for (int32_t i = threadIdx.x; i < hidden_size; i += blockDim.x) { + for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) { float x = static_cast(input[token_offset + i]); if constexpr (has_residual) { x += static_cast(residual[token_offset + i]); @@ -142,7 +142,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input, int32_t const num_vec_elems = hidden_size >> 2; #pragma unroll 4 - for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) { + for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) { vec4_t in = vec_input[i]; vec4_t x; @@ -206,7 +206,7 @@ __device__ void compute_dynamic_per_token_scales( float block_absmax_val_maybe = 0.0f; #pragma unroll 4 - for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) { + for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) { vec4_t in = vec_input[i]; vec4_t const w = vec_weight[i]; @@ -286,7 +286,7 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output, // TODO(luka/varun) extract into type-agnostic vectorized quant function to // replace scaled_fp8_conversion_vec #pragma unroll 4 - for (int32_t i = threadIdx.x; i < num_vec_elems; i += blockDim.x) { + for (auto i = threadIdx.x; i < num_vec_elems; i += blockDim.x) { vec4_t const in = vec_input[i]; vec4_t const w = vec_weight[i]; diff --git a/csrc/quantization/gguf/dequantize.cuh b/csrc/quantization/gguf/dequantize.cuh index c012262e49015..41fc032ff1a56 100644 --- a/csrc/quantization/gguf/dequantize.cuh +++ b/csrc/quantization/gguf/dequantize.cuh @@ -101,10 +101,10 @@ static __global__ void 
dequantize_block(const void * __restrict__ vx, dst_t * __ template static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { - const int i = blockIdx.x; + const auto i = blockIdx.x; const block_q2_K * x = (const block_q2_K *) vx; - const int tid = threadIdx.x; + const auto tid = threadIdx.x; const int n = tid/32; const int l = tid - 32*n; const int is = 8*n + l/16; @@ -123,10 +123,10 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t template static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { - const int i = blockIdx.x; + const auto i = blockIdx.x; const block_q3_K * x = (const block_q3_K *) vx; - const int r = threadIdx.x/4; + const auto r = threadIdx.x/4; const int tid = r/2; const int is0 = r%2; const int l0 = 16*is0 + 4*(threadIdx.x%4); @@ -164,10 +164,10 @@ template static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { const block_q4_K * x = (const block_q4_K *) vx; - const int i = blockIdx.x; + const auto i = blockIdx.x; // assume 32 threads - const int tid = threadIdx.x; + const auto tid = threadIdx.x; const int il = tid/8; const int ir = tid%8; const int is = 2*il; @@ -197,10 +197,10 @@ template static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { const block_q5_K * x = (const block_q5_K *) vx; - const int i = blockIdx.x; + const auto i = blockIdx.x; // assume 64 threads - this is very slightly better than the one below - const int tid = threadIdx.x; + const auto tid = threadIdx.x; const int il = tid/16; // il is in 0...3 const int ir = tid%16; // ir is in 0...15 const int is = 2*il; // is is in 0...6 @@ -231,10 +231,10 @@ template static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { const block_q6_K * x = (const block_q6_K *) vx; - const int i = blockIdx.x; + const auto i = blockIdx.x; 
// assume 64 threads - this is very slightly better than the one below - const int tid = threadIdx.x; + const auto tid = threadIdx.x; const int ip = tid/32; // ip is 0 or 1 const int il = tid - 32*ip; // 0...32 const int is = 8*ip + il/16; @@ -256,10 +256,10 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t template static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) { - const int i = blockIdx.x; + const auto i = blockIdx.x; const block_iq2_xxs * x = (const block_iq2_xxs *) vx; - const int tid = threadIdx.x; + const auto tid = threadIdx.x; const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -275,10 +275,10 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds template static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) { - const int i = blockIdx.x; + const auto i = blockIdx.x; const block_iq2_xs * x = (const block_iq2_xs *) vx; - const int tid = threadIdx.x; + const auto tid = threadIdx.x; const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -293,10 +293,10 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst template static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) { - const int i = blockIdx.x; + const auto i = blockIdx.x; const block_iq2_s * x = (const block_iq2_s *) vx; - const int tid = threadIdx.x; + const auto tid = threadIdx.x; const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -309,10 +309,10 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_ template static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) { - const int i = blockIdx.x; + const auto i = blockIdx.x; const 
block_iq3_xxs * x = (const block_iq3_xxs *) vx; - const int tid = threadIdx.x; + const auto tid = threadIdx.x; const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -332,10 +332,10 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds template static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) { - const int i = blockIdx.x; + const auto i = blockIdx.x; const block_iq3_s * x = (const block_iq3_s *) vx; - const int tid = threadIdx.x; + const auto tid = threadIdx.x; const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; @@ -399,10 +399,10 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_ template static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) { - const int i = blockIdx.x; + const auto i = blockIdx.x; const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL); - const int tid = threadIdx.x; + const auto tid = threadIdx.x; const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 4*il; @@ -417,10 +417,10 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst template static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) { - const int i = blockIdx.x; + const auto i = blockIdx.x; const block_iq4_xs * x = (const block_iq4_xs *)vx; - const int tid = threadIdx.x; + const auto tid = threadIdx.x; const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 4*il; @@ -565,4 +565,4 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(int64_t type) { default: return nullptr; } -} \ No newline at end of file +} diff --git a/csrc/quantization/gguf/gguf_kernel.cu b/csrc/quantization/gguf/gguf_kernel.cu index 46b716bbd98d7..b0f31c45e7311 100644 --- 
a/csrc/quantization/gguf/gguf_kernel.cu +++ b/csrc/quantization/gguf/gguf_kernel.cu @@ -19,11 +19,11 @@ template static __global__ void quantize_q8_1(const scalar_t* __restrict__ x, void* __restrict__ vy, const int kx, const int kx_padded) { - const int ix = blockDim.x * blockIdx.x + threadIdx.x; + const auto ix = blockDim.x * blockIdx.x + threadIdx.x; if (ix >= kx_padded) { return; } - const int iy = blockDim.y * blockIdx.y + threadIdx.y; + const auto iy = blockDim.y * blockIdx.y + threadIdx.y; const int i_padded = iy * kx_padded + ix; block_q8_1* y = (block_q8_1*)vy; diff --git a/csrc/quantization/gguf/mmq.cuh b/csrc/quantization/gguf/mmq.cuh index e2b93680ffb5a..7c89918c23d8a 100644 --- a/csrc/quantization/gguf/mmq.cuh +++ b/csrc/quantization/gguf/mmq.cuh @@ -14,10 +14,10 @@ static __device__ __forceinline__ void mul_mat_q( const int & ncols_dst = ncols_y; - const int row_dst_0 = blockIdx.x*mmq_y; + const auto row_dst_0 = blockIdx.x*mmq_y; const int & row_x_0 = row_dst_0; - const int col_dst_0 = blockIdx.y*mmq_x; + const auto col_dst_0 = blockIdx.y*mmq_x; const int & col_y_0 = col_dst_0; int * tile_x_ql = nullptr; @@ -39,7 +39,7 @@ static __device__ __forceinline__ void mul_mat_q( #pragma unroll for (int ir = 0; ir < qr && ib0 + ir * blocks_per_warp/qr < blocks_per_row_x; ++ir) { - const int kqs = ir*WARP_SIZE_GGUF + threadIdx.x; + const auto kqs = ir*WARP_SIZE_GGUF + threadIdx.x; const int kbxd = kqs / QI8_1; #pragma unroll @@ -53,7 +53,7 @@ static __device__ __forceinline__ void mul_mat_q( #pragma unroll for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) { const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE_GGUF/QI8_1)) % mmq_x; - const int kby = threadIdx.x % (WARP_SIZE_GGUF/QI8_1); + const auto kby = threadIdx.x % (WARP_SIZE_GGUF/QI8_1); const int col_y_eff = min(col_y_0 + ids, ncols_y-1); // if the sum is not needed it's faster to transform the scale to f32 ahead of time @@ -87,14 +87,14 @@ static __device__ __forceinline__ void 
mul_mat_q( #pragma unroll for (int j = 0; j < mmq_x; j += nwarps) { - const int col_dst = col_dst_0 + j + threadIdx.y; + const auto col_dst = col_dst_0 + j + threadIdx.y; if (col_dst >= ncols_dst) { return; } #pragma unroll for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) { - const int row_dst = row_dst_0 + threadIdx.x + i; + const auto row_dst = row_dst_0 + threadIdx.x + i; if (row_dst >= nrows_dst) { continue; } diff --git a/csrc/quantization/gguf/mmvq.cuh b/csrc/quantization/gguf/mmvq.cuh index d83f297455548..687cb0a374105 100644 --- a/csrc/quantization/gguf/mmvq.cuh +++ b/csrc/quantization/gguf/mmvq.cuh @@ -1,7 +1,7 @@ // copied and adapted from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmvq.cu template static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, const int ncols, const int nrows) { - const int row = blockIdx.x*blockDim.y + threadIdx.y; + const auto row = blockIdx.x*blockDim.y + threadIdx.y; if (row >= nrows) { return; @@ -16,7 +16,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * const block_q_t * x = (const block_q_t *) vx; const block_q8_1 * y = (const block_q8_1 *) vy; - for (int i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) { + for (auto i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) { const int ibx = row*blocks_per_row + i; // x block index const int iby = i * (qk/QK8_1); // y block index that aligns with ibx diff --git a/csrc/quantization/gguf/moe.cuh b/csrc/quantization/gguf/moe.cuh index e499f53a2acdf..2dbafc0f74225 100644 --- a/csrc/quantization/gguf/moe.cuh +++ b/csrc/quantization/gguf/moe.cuh @@ -19,10 +19,10 @@ static __device__ __forceinline__ void moe_q( const int ncols_dst = ncols_y * top_k; - const int row_dst_0 = blockIdx.x * mmq_y; + const auto row_dst_0 = blockIdx.x * mmq_y; const int& row_x_0 = row_dst_0; - const int col_dst_0 = blockIdx.y * mmq_x; + const auto 
col_dst_0 = blockIdx.y * mmq_x; int token_offs[mmq_x / nwarps]; for (int i = 0; i < mmq_x; i += nwarps) { @@ -56,7 +56,7 @@ static __device__ __forceinline__ void moe_q( const int n_per_r = ((qk * blocks_per_warp) / qr); #pragma unroll for (int ir = 0; ir < qr && ib0 * qk + ir * n_per_r < ncols_x; ++ir) { - const int kqs = ir * WARP_SIZE_GGUF + threadIdx.x; + const auto kqs = ir * WARP_SIZE_GGUF + threadIdx.x; const int kbxd = kqs / QI8_1; #pragma unroll @@ -73,7 +73,7 @@ static __device__ __forceinline__ void moe_q( } if (threadIdx.x < n_per_r / QK8_1) { - const int kby = threadIdx.x % (WARP_SIZE_GGUF / QI8_1); + const auto kby = threadIdx.x % (WARP_SIZE_GGUF / QI8_1); const int col_y_eff = token_offs[threadIdx.y] / top_k; const int block_x = ib0 * (qk / QK8_1) + ir * (WARP_SIZE_GGUF / QI8_1) + kby; @@ -119,7 +119,7 @@ static __device__ __forceinline__ void moe_q( #pragma unroll for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) { - const int row_dst = row_dst_0 + threadIdx.x + i; + const auto row_dst = row_dst_0 + threadIdx.x + i; if (row_dst >= nrows_dst) { continue; } diff --git a/csrc/quantization/gptq/q_gemm.cu b/csrc/quantization/gptq/q_gemm.cu index 538cb5848e21f..6fad16e196bbc 100644 --- a/csrc/quantization/gptq/q_gemm.cu +++ b/csrc/quantization/gptq/q_gemm.cu @@ -199,12 +199,12 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel( MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - int t = threadIdx.x; + auto t = threadIdx.x; // Block - int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; - int offset_m = blockIdx.y * m_count; - int offset_k = blockIdx.z * BLOCK_KN_SIZE; + auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; + auto offset_m = blockIdx.y * m_count; + auto offset_k = blockIdx.z * BLOCK_KN_SIZE; [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); [[maybe_unused]] int end_m = min(offset_m + m_count, size_m); @@ -337,12 +337,12 @@ __global__ void 
gemm_half_q_half_gptq_2bit_kernel( MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - int t = threadIdx.x; + auto t = threadIdx.x; // Block - int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; - int offset_m = blockIdx.y * m_count; - int offset_k = blockIdx.z * BLOCK_KN_SIZE; + auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; + auto offset_m = blockIdx.y * m_count; + auto offset_k = blockIdx.z * BLOCK_KN_SIZE; [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); [[maybe_unused]] int end_m = min(offset_m + m_count, size_m); @@ -458,12 +458,12 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel( MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - int t = threadIdx.x; + auto t = threadIdx.x; // Block - int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; - int offset_m = blockIdx.y * m_count; - int offset_k = blockIdx.z * BLOCK_KN_SIZE; + auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; + auto offset_m = blockIdx.y * m_count; + auto offset_k = blockIdx.z * BLOCK_KN_SIZE; [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); [[maybe_unused]] int end_m = min(offset_m + m_count, size_m); @@ -586,12 +586,12 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel( MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - int t = threadIdx.x; + auto t = threadIdx.x; // Block - int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; - int offset_m = blockIdx.y * m_count; - int offset_k = blockIdx.z * BLOCK_KN_SIZE; + auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; + auto offset_m = blockIdx.y * m_count; + auto offset_k = blockIdx.z * BLOCK_KN_SIZE; [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); [[maybe_unused]] int end_m = min(offset_m + m_count, size_m); @@ -765,14 +765,14 @@ __global__ void 
reconstruct_exllama_8bit_kernel( MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - int offset_k = BLOCK_KN_SIZE * blockIdx.y; - int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; + auto offset_k = BLOCK_KN_SIZE * blockIdx.y; + auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); // Preload remapping table __shared__ int perm[BLOCK_KN_SIZE]; - int t = threadIdx.x; + auto t = threadIdx.x; if (b_q_perm) { if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t]; @@ -862,14 +862,14 @@ __global__ void reconstruct_exllama_4bit_kernel( MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - int offset_k = BLOCK_KN_SIZE * blockIdx.y; - int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; + auto offset_k = BLOCK_KN_SIZE * blockIdx.y; + auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); // Preload remapping table __shared__ int perm[BLOCK_KN_SIZE]; - int t = threadIdx.x; + auto t = threadIdx.x; if (b_q_perm) { if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t]; @@ -967,14 +967,14 @@ __global__ void reconstruct_exllama_3bit_kernel( MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - int offset_k = BLOCK_KN_SIZE * blockIdx.y; - int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; + auto offset_k = BLOCK_KN_SIZE * blockIdx.y; + auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); // Preload remapping table __shared__ int perm[BLOCK_KN_SIZE]; - int t = threadIdx.x; + auto t = threadIdx.x; if (b_q_perm) { if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t]; @@ -1065,14 +1065,14 @@ __global__ void reconstruct_exllama_2bit_kernel( MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); MatrixView_half 
b_gptq_scales_(b_gptq_scales, groups, size_n); - int offset_k = BLOCK_KN_SIZE * blockIdx.y; - int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; + auto offset_k = BLOCK_KN_SIZE * blockIdx.y; + auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); // Preload remapping table __shared__ int perm[BLOCK_KN_SIZE]; - int t = threadIdx.x; + auto t = threadIdx.x; if (b_q_perm) { if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t]; @@ -1181,11 +1181,11 @@ __global__ void gemm_half_q_half_alt_4bit_kernel( int zero_width = width / 8; int vec_height = height * 4; const int blockwidth2 = BLOCK_KN_SIZE / 2; - int b = blockIdx.y * BLOCK_M_SIZE_MAX; + auto b = blockIdx.y * BLOCK_M_SIZE_MAX; int b_end = min(BLOCK_M_SIZE_MAX, batch - b); - int h = BLOCK_KN_SIZE * blockIdx.z / 8; + auto h = BLOCK_KN_SIZE * blockIdx.z / 8; int h_end = min(BLOCK_KN_SIZE / 8, height - h) * 4; - int w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; + auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; __shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2]; if (threadIdx.x < h_end) { @@ -1197,8 +1197,8 @@ __global__ void gemm_half_q_half_alt_4bit_kernel( } __shared__ half2 deq2[256][8]; - int val = threadIdx.x / 8; - int off = threadIdx.x % 8; + auto val = threadIdx.x / 8; + auto off = threadIdx.x % 8; for (; val < 256; val += BLOCK_KN_SIZE / 8) { deq2[val][off] = __halves2half2(__int2half_rn(val & 0xF), __int2half_rn(val >> 4)); @@ -1280,11 +1280,11 @@ __global__ void gemm_half_q_half_alt_8bit_kernel( int zero_width = width / 4; int vec_height = height * 2; const int blockwidth2 = BLOCK_KN_SIZE / 2; - int b = blockIdx.y * BLOCK_M_SIZE_MAX; + auto b = blockIdx.y * BLOCK_M_SIZE_MAX; int b_end = min(BLOCK_M_SIZE_MAX, batch - b); - int h = BLOCK_KN_SIZE * blockIdx.z / 4; + auto h = BLOCK_KN_SIZE * blockIdx.z / 4; int h_end = min(BLOCK_KN_SIZE / 4, height - h) * 2; - int w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; + auto w = BLOCK_KN_SIZE * blockIdx.x + 
threadIdx.x; __shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2]; if (threadIdx.x < h_end) { @@ -1393,8 +1393,8 @@ __global__ void reconstruct_gptq_kernel(const uint32_t* __restrict__ w, half* __restrict__ out) { // Start of block - int column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; - int row = blockIdx.y * 32 / bit; + auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; + auto row = blockIdx.y * 32 / bit; if (column >= width) return; // Views @@ -1425,8 +1425,8 @@ __global__ void reconstruct_gptq_3bit_kernel( const int height, const int width, const int group, half* __restrict__ out) { // Start of block - int column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; - int row = blockIdx.y * 32; + auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; + auto row = blockIdx.y * 32; if (column >= width) return; // Views @@ -1542,7 +1542,7 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a, __global__ void shuffle_4bit_kernel(uint32_t* __restrict__ b_q_weight, const int size_k, const int size_n) { - int n = blockIdx.x * THREADS_X + threadIdx.x; + auto n = blockIdx.x * THREADS_X + threadIdx.x; if (n >= size_n) return; int k = 0; uint32_t* b_ptr = b_q_weight + n; @@ -1555,7 +1555,7 @@ __global__ void shuffle_4bit_kernel(uint32_t* __restrict__ b_q_weight, __global__ void shuffle_8bit_kernel(uint32_t* __restrict__ b_q_weight, const int size_k, const int size_n) { - int n = blockIdx.x * THREADS_X + threadIdx.x; + auto n = blockIdx.x * THREADS_X + threadIdx.x; if (n >= size_n) return; int k = 0; uint32_t* b_ptr = b_q_weight + n; @@ -1568,7 +1568,7 @@ __global__ void shuffle_8bit_kernel(uint32_t* __restrict__ b_q_weight, __global__ void shuffle_2bit_kernel(uint32_t* __restrict__ b_q_weight, const int size_k, const int size_n) { - int n = blockIdx.x * THREADS_X + threadIdx.x; + auto n = blockIdx.x * THREADS_X + threadIdx.x; if (n >= size_n) return; int k = 0; uint32_t* b_ptr = b_q_weight + n; @@ -1581,7 +1581,7 @@ __global__ void 
shuffle_2bit_kernel(uint32_t* __restrict__ b_q_weight, __global__ void shuffle_3bit_kernel(uint32_t* __restrict__ b_q_weight, const int size_k, const int size_n) { - int n = blockIdx.x * THREADS_X + threadIdx.x; + auto n = blockIdx.x * THREADS_X + threadIdx.x; if (n >= size_n) return; int k = 0; uint32_t* b_ptr = b_q_weight + n; @@ -1599,9 +1599,9 @@ __global__ void make_sequential_4bit_kernel(const uint32_t* __restrict__ w, const uint64_t* w2 = (uint64_t*)w; uint64_t* w_new2 = (uint64_t*)w_new; int w2_stride = w_width >> 1; - int w2_column = THREADS_X * blockIdx.x + threadIdx.x; + auto w2_column = THREADS_X * blockIdx.x + threadIdx.x; if (w2_column >= w2_stride) return; - int w_new2_row = blockIdx.y; + auto w_new2_row = blockIdx.y; int q_perm_idx = w_new2_row << 3; uint64_t dst = 0; @@ -1630,9 +1630,9 @@ __global__ void make_sequential_2bit_kernel(const uint32_t* __restrict__ w, const uint64_t* w2 = (uint64_t*)w; uint64_t* w_new2 = (uint64_t*)w_new; int w2_stride = w_width >> 1; - int w2_column = THREADS_X * blockIdx.x + threadIdx.x; + auto w2_column = THREADS_X * blockIdx.x + threadIdx.x; if (w2_column >= w2_stride) return; - int w_new2_row = blockIdx.y; + auto w_new2_row = blockIdx.y; int q_perm_idx = w_new2_row << 4; uint64_t dst = 0; @@ -1658,10 +1658,10 @@ __global__ void make_sequential_3bit_kernel(const uint32_t* __restrict__ w, uint32_t* __restrict__ w_new, const int* __restrict__ q_perm, const int w_width) { - int w_column = THREADS_X * blockIdx.x + threadIdx.x; + auto w_column = THREADS_X * blockIdx.x + threadIdx.x; if (w_column >= w_width) return; - int w_new_row = blockIdx.y * 3; - int q_perm_idx = blockIdx.y << 5; + auto w_new_row = blockIdx.y * 3; + auto q_perm_idx = blockIdx.y << 5; uint32_t dst[3] = {0, 0, 0}; #pragma unroll @@ -1744,9 +1744,9 @@ __global__ void make_sequential_8bit_kernel(const uint32_t* __restrict__ w, const uint64_t* w2 = (uint64_t*)w; uint64_t* w_new2 = (uint64_t*)w_new; int w2_stride = w_width >> 1; - int w2_column = THREADS_X 
* blockIdx.x + threadIdx.x; + auto w2_column = THREADS_X * blockIdx.x + threadIdx.x; if (w2_column >= w2_stride) return; - int w_new2_row = blockIdx.y; + auto w_new2_row = blockIdx.y; int q_perm_idx = w_new2_row << 2; uint64_t dst = 0; diff --git a/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu b/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu index b520f8c32b95b..ec0bf2c3cb4bd 100644 --- a/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu +++ b/csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu @@ -55,11 +55,11 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK { this_block_B_base_ptr = params.B_ptr + blockIdx.y * Ntile * params.K + blockIdx.z * params.SplitK * 4; - const int lane_id = threadIdx.x % WARP_SIZE; + const auto lane_id = threadIdx.x % WARP_SIZE; // For matrix A, a block load/store Mtile(row) x 32(col) elements in // multiple iters, 8x4 warp load/store 8(row) x 32(col) elements per iter - const int Aldg_row_base_idx = threadIdx.x / 4; + const auto Aldg_row_base_idx = threadIdx.x / 4; Aldg_col_idx = (threadIdx.x % 4) * LDG_ELEMENT_CNT_A; const int Aldg_base_offset = Aldg_row_base_idx * params.K + Aldg_col_idx; @@ -67,7 +67,7 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK { // elements of N32K16 packing in multiple iters, 4x8 warp load/store 4(row) // * 128(col) per iter Bldg_col_idx = (threadIdx.x % 8) * LDG_ELEMENT_CNT_B; - const int Bldg_row_base_idx = threadIdx.x / 8; + const auto Bldg_row_base_idx = threadIdx.x / 8; const int Bldg_base_offset = Bldg_row_base_idx * params.K * 4 + Bldg_col_idx; @@ -89,7 +89,7 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK { B_ldg_guard = 0; #pragma unroll for (int i = 0; i < (Mtile + M_SIZE_ONE_LOAD - 1) / M_SIZE_ONE_LOAD; ++i) { - int m_idx = blockIdx.x * Mtile + Aldg_row_base_idx + i * M_SIZE_ONE_LOAD; + auto m_idx = blockIdx.x * Mtile + Aldg_row_base_idx + i * M_SIZE_ONE_LOAD; if (m_idx < params.M) { A_ldg_guard |= (1u << i); } @@ 
-98,8 +98,8 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK { const int N_padded = (params.N + 31) / 32 * 32; #pragma unroll for (int i = 0; i < (Ntile + N_SIZE_ONE_LOAD - 1) / N_SIZE_ONE_LOAD; ++i) { - int n_idx = blockIdx.y * Ntile + (Bldg_row_base_idx / 8) * 32 + - i * N_SIZE_ONE_LOAD; + auto n_idx = blockIdx.y * Ntile + (Bldg_row_base_idx / 8) * 32 + + i * N_SIZE_ONE_LOAD; if (n_idx < N_padded) { B_ldg_guard |= (1u << i); } @@ -355,7 +355,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK { __device__ void fused_splitk_reduce() { // need splitk-reduce if enable splitk if (gridDim.z > 1) { - int blk_red_idx = blockIdx.x * gridDim.y + blockIdx.y; + auto blk_red_idx = blockIdx.x * gridDim.y + blockIdx.y; // Wait for all previous blocks in the splitk direction to accumulate the // results into C_tmp if (threadIdx.x == 0) { @@ -371,7 +371,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK { } __syncthreads(); - int C_tmp_base_offset = blk_red_idx * Mtile * Ntile + threadIdx.x * 4; + auto C_tmp_base_offset = blk_red_idx * Mtile * Ntile + threadIdx.x * 4; if (blockIdx.z != 0) { // expecting that temporary register here reuses the previous A&B frag // register @@ -456,7 +456,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK { FType* C_base_ptr = this_block_C_base_ptr + store_c_base_offset; // C_tile lds and stg - int m_base_idx = store_c_row_base_idx + blockIdx.x * Mtile; + auto m_base_idx = store_c_row_base_idx + blockIdx.x * Mtile; bool n_guard = (store_c_col_idx + blockIdx.y * Ntile) < params.N; if (WARP_NTILE == 32) { int lds_c_base_offset = warp_id * Mtile * WARP_NTILE + @@ -580,9 +580,9 @@ __global__ void __launch_bounds__(BLOCK) int sts_stage_idx = 0; int lds_stage_idx = 0; - int tb_k_slice = blockIdx.z * params.SplitK + params.SplitK <= params.K - ? 
params.SplitK - : params.K - blockIdx.z * params.SplitK; + auto tb_k_slice = blockIdx.z * params.SplitK + params.SplitK <= params.K + ? params.SplitK + : params.K - blockIdx.z * params.SplitK; int k_tiles = (tb_k_slice + 31) / 32; int first_k_tile = tb_k_slice - (k_tiles - 1) * 32; @@ -777,13 +777,13 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel( const QT* qdata, const FT* scales, const FT* zeros, FT* fdata, const int N_32align, const int N, const int K) { __shared__ FT smem[64 * 32]; - int warp_id = threadIdx.x / 32; - int lane_id = threadIdx.x % 32; - const int src_row_idx = blockIdx.x * 8 + lane_id / 4; + auto warp_id = threadIdx.x / 32; + auto lane_id = threadIdx.x % 32; + const auto src_row_idx = blockIdx.x * 8 + lane_id / 4; const int src_col_idx = blockIdx.y * 64 * 4 + warp_id * 16 * 4 + (lane_id % 4) * 16; const int src_offset = src_row_idx * K * 4 + src_col_idx; - int params_nidx = blockIdx.x * 32 + (lane_id / 4) * 4; + auto params_nidx = blockIdx.x * 32 + (lane_id / 4) * 4; QT qval_reg[16]; const QT* pdata = qdata + src_offset; @@ -829,8 +829,8 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel( *reinterpret_cast(smem + lds_base_offset + i * 32 * 32); } - const int dst_row_base_kidx = blockIdx.y * 64 + threadIdx.x / 4; - const int dst_col_nidx = blockIdx.x * 32 + (threadIdx.x % 4) * 8; + const auto dst_row_base_kidx = blockIdx.y * 64 + threadIdx.x / 4; + const auto dst_col_nidx = blockIdx.x * 32 + (threadIdx.x % 4) * 8; #pragma unroll for (int i = 0; i < 2; ++i) { int dst_row_kidx = dst_row_base_kidx + i * 32; @@ -1008,4 +1008,4 @@ torch::Tensor allspark_w8a16_gemm( TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { m.impl("allspark_w8a16_gemm", &allspark_w8a16_gemm); -} \ No newline at end of file +} diff --git a/csrc/quantization/gptq_allspark/allspark_repack.cu b/csrc/quantization/gptq_allspark/allspark_repack.cu index 82929c94ad8b6..ea8eccf040df6 100644 --- 
a/csrc/quantization/gptq_allspark/allspark_repack.cu +++ b/csrc/quantization/gptq_allspark/allspark_repack.cu @@ -13,8 +13,8 @@ __global__ void __launch_bounds__(128) const uint8_t* B, const FType* B_scale, const FType* B_zero, uint8_t* B_result, FType* B_scale_result, FType* B_zero_result, const int K, const int N, const int N_32align) { - const int lane_id = threadIdx.x % 32; - const int warp_id = threadIdx.x / 32; + const auto lane_id = threadIdx.x % 32; + const auto warp_id = threadIdx.x / 32; if (blockIdx.x != gridDim.x - 1) { // Load B @@ -50,7 +50,7 @@ __global__ void __launch_bounds__(128) } // Store B - const int dst_row_base_idx = blockIdx.y * (128 / 4) + (lane_id / 8) * 8; + const auto dst_row_base_idx = blockIdx.y * (128 / 4) + (lane_id / 8) * 8; const int dst_col_idx = blockIdx.x * (64 * 4) + warp_id * 64 + (lane_id % 8) * 8; for (int i = 0; i < 8; ++i) { @@ -65,7 +65,7 @@ __global__ void __launch_bounds__(128) } else { // Load B_scale and B_zero FType b_scale_reg, b_zero_reg; - int src_offset = blockIdx.y * 128 + threadIdx.x; + auto src_offset = blockIdx.y * 128 + threadIdx.x; ldg16_cg_0(b_scale_reg, B_scale + src_offset, src_offset < N); if (B_zero != nullptr) ldg16_cg_0(b_zero_reg, B_zero + src_offset, src_offset < N); diff --git a/csrc/quantization/gptq_allspark/allspark_utils.cuh b/csrc/quantization/gptq_allspark/allspark_utils.cuh index 80456c25590d0..831413016538e 100644 --- a/csrc/quantization/gptq_allspark/allspark_utils.cuh +++ b/csrc/quantization/gptq_allspark/allspark_utils.cuh @@ -62,7 +62,7 @@ template __global__ void f16_gemm_splitk_reduce_kernel(const FType* C_split, FType* C, uint32_t n, uint32_t n_matrix, uint32_t matrix_size) { - int idx = blockIdx.x * BLOCK + threadIdx.x; + auto idx = blockIdx.x * BLOCK + threadIdx.x; if (idx >= matrix_size) { return; @@ -407,4 +407,4 @@ static __device__ half2 inline num2num2(const half x) { return __half2half2(x); } -} // namespace allspark \ No newline at end of file +} // namespace allspark From 
10f55fe6c5a7836f173ab93446f8a298d9fb00f6 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 21 Mar 2025 10:17:12 +0800 Subject: [PATCH 05/40] [Misc] Clean up the BitsAndBytes arguments (#15140) Signed-off-by: Jee Jee Li --- docs/source/features/quantization/bnb.md | 6 +++--- .../lora_with_quantization_inference.py | 1 - vllm/engine/arg_utils.py | 15 ++++----------- 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/docs/source/features/quantization/bnb.md b/docs/source/features/quantization/bnb.md index 7525e8e7866c3..b81d89c457513 100644 --- a/docs/source/features/quantization/bnb.md +++ b/docs/source/features/quantization/bnb.md @@ -25,7 +25,7 @@ import torch # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint. model_id = "unsloth/tinyllama-bnb-4bit" llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ -quantization="bitsandbytes", load_format="bitsandbytes") +quantization="bitsandbytes") ``` ## Inflight quantization: load as 4bit quantization @@ -35,7 +35,7 @@ from vllm import LLM import torch model_id = "huggyllama/llama-7b" llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ -quantization="bitsandbytes", load_format="bitsandbytes") +quantization="bitsandbytes") ``` ## OpenAI Compatible Server @@ -43,5 +43,5 @@ quantization="bitsandbytes", load_format="bitsandbytes") Append the following to your 4bit model arguments: ```console ---quantization bitsandbytes --load-format bitsandbytes +--quantization bitsandbytes ``` diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py index a409735013f6d..ab235ddd75455 100644 --- a/examples/offline_inference/lora_with_quantization_inference.py +++ b/examples/offline_inference/lora_with_quantization_inference.py @@ -83,7 +83,6 @@ def initialize_engine(model: str, quantization: str, engine_args = EngineArgs(model=model, quantization=quantization, 
qlora_adapter_name_or_path=lora_repo, - load_format="bitsandbytes", enable_lora=True, max_lora_rank=64) else: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index bbe780a0ec118..88d70acb79d23 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1170,22 +1170,15 @@ class EngineArgs: ) def create_load_config(self) -> LoadConfig: - # bitsandbytes quantization needs a specific model loader - # so we make sure the quant method and the load format are consistent - if (self.quantization == "bitsandbytes" or - self.qlora_adapter_name_or_path is not None) and \ - self.load_format != "bitsandbytes": - raise ValueError( - "BitsAndBytes quantization and QLoRA adapter only support " - f"'bitsandbytes' load format, but got {self.load_format}") - if (self.load_format == "bitsandbytes" or - self.qlora_adapter_name_or_path is not None) and \ + if(self.qlora_adapter_name_or_path is not None) and \ self.quantization != "bitsandbytes": raise ValueError( - "BitsAndBytes load format and QLoRA adapter only support " + "QLoRA adapter only support " f"'bitsandbytes' quantization, but got {self.quantization}") + if self.quantization == "bitsandbytes": + self.load_format = "bitsandbytes" return LoadConfig( load_format=self.load_format, download_dir=self.download_dir, From 2e0b4cfde04585ef3acb9257d9f7b38776da926d Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 20 Mar 2025 19:17:33 -0700 Subject: [PATCH 06/40] [ROCM] Upgrade torch to 2.6 (#15244) Signed-off-by: Sage Moore --- requirements/rocm-build.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt index a0731c51d46bd..6af78da4993db 100644 --- a/requirements/rocm-build.txt +++ b/requirements/rocm-build.txt @@ -1,10 +1,10 @@ # Common dependencies -r common.txt ---extra-index-url https://download.pytorch.org/whl/rocm6.2 -torch==2.5.1 -torchvision==0.20.1 -torchaudio==2.5.1 +--extra-index-url 
https://download.pytorch.org/whl/rocm6.2.4 +torch==2.6.0 +torchvision==0.21.0 +torchaudio==2.6.0 cmake>=3.26 packaging From 1e508343e1ec6c5466586a9bcb7b229f0d879c76 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 21 Mar 2025 10:18:04 +0800 Subject: [PATCH 07/40] [Bugfix] Fix incorrect qwen2.5-vl attention mask pre-computation (#15200) Signed-off-by: Isotr0py <2037008807@qq.com> --- .../vision_language/test_models.py | 13 +++++++++++++ .../vision_language/vlm_utils/custom_inputs.py | 18 ++++++++++++++++++ vllm/model_executor/models/qwen2_5_vl.py | 10 ++++++---- 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 5690249eb3754..0235140187990 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -508,6 +508,19 @@ VLM_TEST_SETTINGS = { limit_mm_per_prompt={"image": 4}, )], ), + # regression test for https://github.com/vllm-project/vllm/issues/15122 + "qwen2_5_vl-windows-attention": VLMTestInfo( + models=["Qwen/Qwen2.5-VL-3B-Instruct"], + test_type=VLMTestType.CUSTOM_INPUTS, + max_model_len=4096, + max_num_seqs=2, + auto_cls=AutoModelForVision2Seq, + vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, + custom_test_opts=[CustomTestOptions( + inputs=custom_inputs.windows_attention_image_qwen2_5_vl(), + limit_mm_per_prompt={"image": 1}, + )], + ), } # yapf: enable diff --git a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py index 2f03a114ae531..235618ae547ea 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py @@ -1,7 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 """Custom input builders for edge-cases in different models.""" +from io import BytesIO from 
typing import Callable +import requests +from PIL import Image + from vllm.multimodal.image import rescale_image_size from vllm.multimodal.video import (rescale_video_size, resize_video, sample_frames_from_video) @@ -102,3 +106,17 @@ def different_patch_input_cases_internvl(): build_single_image_inputs(images, formatted_sprompts, wrapped_sf), build_multi_image_inputs([images], formatted_mprompts, wrapped_sf), ] + + +def windows_attention_image_qwen2_5_vl(): + # image from regression issue: https://github.com/vllm-project/vllm/issues/15122 + image_url = "https://aomediacodec.github.io/av1-avif/testFiles/Link-U/hato.jpg" + image = Image.open(BytesIO(requests.get(image_url).content)) + + question = "Describe the image." + img_prompt = "<|vision_start|><|image_pad|><|vision_end|>" + prompt = (f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n" + "<|im_start|>assistant\n") + + wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5]) + return build_single_image_inputs([image], [prompt], wrapped_sf) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 8a570d138c6c2..adca97c71c581 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -647,15 +647,17 @@ class Qwen2_5_VisionTransformer(nn.Module): max_seqlen = None seqlens = None - if self.attn_backend == _Backend.FLASH_ATTN: - max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() - elif self.attn_backend == _Backend.XFORMERS: - seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() for layer_num, blk in enumerate(self.blocks): if layer_num in self.fullatt_block_indexes: cu_seqlens_now = cu_seqlens else: cu_seqlens_now = cu_window_seqlens + # pre-compute cu_seqlens for window attn + if self.attn_backend == _Backend.FLASH_ATTN: + max_seqlen = (cu_seqlens_now[1:] - + cu_seqlens_now[:-1]).max().item() + elif self.attn_backend == _Backend.XFORMERS: + seqlens = (cu_seqlens_now[1:] - 
cu_seqlens_now[:-1]).tolist() hidden_states = blk( hidden_states, cu_seqlens=cu_seqlens_now, From 6edbfa924df0d1809028296d02dd328bd8139529 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 21 Mar 2025 02:18:36 +0000 Subject: [PATCH 08/40] Mention `extra_body` as a way to pass vLLM-only parameters using the OpenAI client (#15240) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/source/serving/openai_compatible_server.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 0880a4530d8c2..a6ec05f45b69b 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -29,6 +29,11 @@ completion = client.chat.completions.create( print(completion.choices[0].message) ``` +:::{tip} +vLLM supports some parameters that are not supported by OpenAI, `top_k` for example. +You can pass these parameters to vLLM using the OpenAI client in the `extra_body` parameter of your requests, e.g. `extra_body={"top_k": 50}` for `top_k`. 
+::: + ## Supported APIs We currently support the following OpenAI APIs: From 47195057e9f2b93b35579e2ffd26e2e15cd49979 Mon Sep 17 00:00:00 2001 From: Hyesoo Yang <45211235+hyeygit@users.noreply.github.com> Date: Thu, 20 Mar 2025 19:19:40 -0700 Subject: [PATCH 09/40] [V1][TPU] Speed up top-k on TPU by using torch.topk (#15242) Signed-off-by: Hyesoo Yang --- tests/v1/tpu/test_sampler.py | 3 ++- vllm/envs.py | 6 ++++++ vllm/v1/sample/ops/topk_topp_sampler.py | 24 +++++++++++++++++++++--- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/tests/v1/tpu/test_sampler.py b/tests/v1/tpu/test_sampler.py index 76b8ddb92b782..4e5a57bee3275 100644 --- a/tests/v1/tpu/test_sampler.py +++ b/tests/v1/tpu/test_sampler.py @@ -39,7 +39,7 @@ def test_sampler_compilation(model_name: str, monkeypatch): sampling_params = SamplingParams( temperature=0.7, # top_p=0.6, # TODO too slow! - # top_k=10, + top_k=10, min_p=0.2, max_tokens=16) s = time() @@ -49,6 +49,7 @@ def test_sampler_compilation(model_name: str, monkeypatch): # Second request with different params, but for which we # compiled for in previous eager iteration. sampling_params = SamplingParams(temperature=0.1, + top_k=12, min_p=0.8, max_tokens=24) s = time() diff --git a/vllm/envs.py b/vllm/envs.py index 56bf862674768..d88ab3b5e7d06 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -95,6 +95,7 @@ if TYPE_CHECKING: VLLM_DP_MASTER_PORT: int = 0 VLLM_MARLIN_USE_ATOMIC_ADD: bool = False VLLM_V0_USE_OUTLINES_CACHE: bool = False + VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION: bool = False def get_default_cache_root(): @@ -623,6 +624,11 @@ environment_variables: dict[str, Callable[[], Any]] = { # an environment with potentially malicious users. 
"VLLM_V0_USE_OUTLINES_CACHE": lambda: os.environ.get("VLLM_V0_USE_OUTLINES_CACHE", "0") == "1", + + # If set, disables TPU-specific optimization for top-k & top-p sampling + "VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION": + lambda: bool(int(os.environ["VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION"])) + if "VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION" in os.environ else None, } # end-env-vars-definition diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index e1a3e92de4931..1dea711874bfd 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -66,7 +66,14 @@ class TopKTopPSampler(nn.Module): "best performance, please install FlashInfer.") self.forward = self.forward_native elif current_platform.is_tpu(): - self.forward = self.forward_tpu + if envs.VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION: + logger.warning( + "TPU-specific optimization for top-k & top-p sampling are " + "disabled, falling back to PyTorch-native implementation " + "which could be very slow.") + self.forward = self.forward_native + else: + self.forward = self.forward_tpu else: self.forward = self.forward_native @@ -105,8 +112,19 @@ class TopKTopPSampler(nn.Module): k: Optional[torch.Tensor], p: Optional[torch.Tensor], ) -> torch.Tensor: - # TODO Placeholder for TPU optimized topk/p kernel - # logits = apply_top_k_top_p(logits, k, p) + # If only top-k is specified, use pytorch's builtin topk op. This leads + # to significant speed up on TPU compared to using apply_top_k_top_p. 
+ if k is not None and p is None: + topk_values, topk_indices = torch.topk(logits, k, dim=-1) + + mask = torch.ones_like(logits, dtype=torch.bool) + mask.scatter_(-1, topk_indices, False) + logits.masked_fill_(mask, float('-inf')) + else: + # TODO Placeholder for TPU optimized topp kernel + # logits = apply_top_k_top_p(logits, k, p) + pass + probs = logits.softmax(dim=-1, dtype=torch.float32) return random_sample(probs, generators) From 0032903a5bb7c7c655f52f4efdfcc221947e9ca8 Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Thu, 20 Mar 2025 20:20:16 -0600 Subject: [PATCH 10/40] [Bugfix] detect alibi and revert to FA2 (#15231) Signed-off-by: Travis Johnson --- vllm/attention/backends/flash_attn.py | 3 ++- vllm/fa_utils.py | 12 +++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index e981ac780b007..4cb0b916739a0 100755 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -630,7 +630,8 @@ class FlashAttentionImpl(AttentionImpl): self.sliding_window = ((sliding_window - 1, 0) if sliding_window is not None else (-1, -1)) self.kv_cache_dtype = kv_cache_dtype - self.vllm_flash_attn_version = get_flash_attn_version() + self.vllm_flash_attn_version = get_flash_attn_version( + requires_alibi=self.alibi_slopes is not None) if (is_quantized_kv_cache(self.kv_cache_dtype) and self.vllm_flash_attn_version != 3): raise NotImplementedError( diff --git a/vllm/fa_utils.py b/vllm/fa_utils.py index 028c96b839fb6..4176534901586 100644 --- a/vllm/fa_utils.py +++ b/vllm/fa_utils.py @@ -7,7 +7,7 @@ from vllm.logger import init_logger logger = init_logger(__name__) -def get_flash_attn_version() -> Optional[int]: +def get_flash_attn_version(requires_alibi: bool = False) -> Optional[int]: # import here to avoid circular dependencies from vllm.platforms import current_platform try: @@ -28,8 +28,14 @@ def get_flash_attn_version() -> Optional[int]: # 3. 
fallback for unsupported combinations if device_capability.major == 10 and fa_version == 3: - logger.warning("Cannot use FA version 3 on Blackwell platform", - "defaulting to FA version 2.") + logger.warning_once( + "Cannot use FA version 3 on Blackwell platform " + "defaulting to FA version 2.") + fa_version = 2 + + if requires_alibi and fa_version == 3: + logger.warning_once("Cannot use FA version 3 with ALiBi, " + "defaulting to FA version 2.") fa_version = 2 if not is_fa_version_supported(fa_version): From 296f927f2493908984707354e3cc5d7b2e41650b Mon Sep 17 00:00:00 2001 From: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com> Date: Thu, 20 Mar 2025 19:21:08 -0700 Subject: [PATCH 11/40] [Model] RE: Mamba2 Prefill Performance Tweaks: Fixing Flurry of Unnecessary Memory Copies (#14857) Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com> --- vllm/model_executor/layers/mamba/mamba_mixer2.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index fec6d6112d665..d7a45bc51239a 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -470,10 +470,11 @@ class MambaMixer2(CustomOp): if has_prefill: initial_states = None - if has_initial_states is not None and any(has_initial_states): - for idx in mamba_cache_params.state_indices_tensor[ - ~has_initial_states]: - mamba_cache_params.ssm_state[idx].zero_() + if has_initial_states is not None and torch.any( + has_initial_states): + zero_init_indices = mamba_cache_params.state_indices_tensor[ + ~has_initial_states] + mamba_cache_params.ssm_state[zero_init_indices] = 0 initial_states = mamba_cache_params.ssm_state[ mamba_cache_params.state_indices_tensor] @@ -499,8 +500,8 @@ class MambaMixer2(CustomOp): # update ssm states # - varlen state is a (batch, nheads, headdim, dstate) tensor - for i, idx in 
enumerate(mamba_cache_params.state_indices_tensor): - mamba_cache_params.ssm_state[idx].copy_(varlen_state[i]) + mamba_cache_params.ssm_state[ + mamba_cache_params.state_indices_tensor] = varlen_state # - reshape hidden_states = scan_output.view(seq_len, -1) From 11b986b3fb8d59d43094297e9c83302c88622de0 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 20 Mar 2025 19:24:21 -0700 Subject: [PATCH 12/40] [Docs] Trim the latest news in README (#15261) Signed-off-by: Woosuk Kwon --- README.md | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/README.md b/README.md index 573b667ca88e3..d829b05714406 100644 --- a/README.md +++ b/README.md @@ -28,19 +28,7 @@ Easy, fast, and cheap LLM serving for everyone - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted. - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html). - [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing). -- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone! 
-- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing). -- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! -- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users! -- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing). -- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing). -- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html). -- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing). -- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! 
Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing). -- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing). -- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing). -- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM. -- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai). +- [2024/12] vLLM joins [PyTorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone! 
--- From 5df2da5b974d577cb98944c057de435dfd615cac Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Thu, 20 Mar 2025 19:27:46 -0700 Subject: [PATCH 13/40] [Misc] Better RayExecutor and multiprocessing compatibility (#14705) Signed-off-by: Cody Yu --- vllm/engine/arg_utils.py | 15 +++++++- vllm/executor/multiproc_worker_utils.py | 4 +-- vllm/executor/ray_utils.py | 21 ++++++----- vllm/utils.py | 48 +++++++++++++++++++------ 4 files changed, 67 insertions(+), 21 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 88d70acb79d23..986d1b407494f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -26,7 +26,7 @@ from vllm.plugins import load_general_plugins from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 from vllm.transformers_utils.utils import check_gguf_file from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser, StoreBoolean +from vllm.utils import FlexibleArgumentParser, StoreBoolean, is_in_ray_actor if TYPE_CHECKING: from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup @@ -1245,6 +1245,18 @@ class EngineArgs: cpu_offload_gb=self.cpu_offload_gb, calculate_kv_scales=self.calculate_kv_scales, ) + + # Get the current placement group if Ray is initialized and + # we are in a Ray actor. If so, then the placement group will be + # passed to spawned processes. + placement_group = None + if is_in_ray_actor(): + import ray + + # This call initializes Ray automatically if it is not initialized, + # but we should not do this here. 
+ placement_group = ray.util.get_current_placement_group() + parallel_config = ParallelConfig( pipeline_parallel_size=self.pipeline_parallel_size, tensor_parallel_size=self.tensor_parallel_size, @@ -1257,6 +1269,7 @@ class EngineArgs: self.tokenizer_pool_extra_config, ), ray_workers_use_nsight=self.ray_workers_use_nsight, + placement_group=placement_group, distributed_executor_backend=self.distributed_executor_backend, worker_cls=self.worker_cls, worker_extension_cls=self.worker_extension_cls, diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py index a4a5b3f938c5b..380b672c3605a 100644 --- a/vllm/executor/multiproc_worker_utils.py +++ b/vllm/executor/multiproc_worker_utils.py @@ -16,7 +16,7 @@ import torch from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils import _check_multiproc_method, get_mp_context, run_method +from vllm.utils import _maybe_force_spawn, get_mp_context, run_method logger = init_logger(__name__) @@ -291,7 +291,7 @@ def set_multiprocessing_worker_envs(parallel_config): in a multiprocessing environment. This should be called by the parent process before worker processes are created""" - _check_multiproc_method() + _maybe_force_spawn() # Configure thread parallelism if OMP_NUM_THREADS isn't set # diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index a7042ca8df17c..b7222f26f663e 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -284,8 +284,9 @@ def initialize_ray_cluster( assert_ray_available() from vllm.platforms import current_platform - # Connect to a ray cluster. - if current_platform.is_rocm() or current_platform.is_xpu(): + if ray.is_initialized(): + logger.info("Ray is already initialized. 
Skipping Ray initialization.") + elif current_platform.is_rocm() or current_platform.is_xpu(): # Try to connect existing ray instance and create a new one if not found try: ray.init("auto", ignore_reinit_error=True) @@ -299,19 +300,21 @@ def initialize_ray_cluster( else: ray.init(address=ray_address, ignore_reinit_error=True) - if parallel_config.placement_group: - # Placement group is already set. - return - device_str = current_platform.ray_device_key if not device_str: raise ValueError( f"current platform {current_platform.device_name} does not " "support ray.") - # Create placement group for worker processes - current_placement_group = ray.util.get_current_placement_group() + # Create or get the placement group for worker processes + if parallel_config.placement_group: + current_placement_group = parallel_config.placement_group + else: + current_placement_group = ray.util.get_current_placement_group() + if current_placement_group: + logger.info("Using the existing placement group") + # We are in a placement group bundles = current_placement_group.bundle_specs # Verify that we can use the placement group. @@ -331,6 +334,8 @@ def initialize_ray_cluster( f"Required number of devices: {parallel_config.world_size}. " f"Total number of devices: {device_bundles}.") else: + logger.info("No current placement group found. " + "Creating a new placement group.") num_devices_in_cluster = ray.cluster_resources().get(device_str, 0) # Log a warning message and delay resource allocation failure response. 
# Avoid immediate rejection to allow user-initiated placement group diff --git a/vllm/utils.py b/vllm/utils.py index 9bc081890bcb7..cb375f8ff3228 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -2147,20 +2147,48 @@ def zmq_socket_ctx(path: str, socket_type: Any) -> Iterator[zmq.Socket]: ctx.destroy(linger=0) -def _check_multiproc_method(): - if (cuda_is_initialized() - and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"): - logger.warning("CUDA was previously initialized. We must use " - "the `spawn` multiprocessing start method. Setting " - "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. " - "See https://docs.vllm.ai/en/latest/getting_started/" - "troubleshooting.html#python-multiprocessing " - "for more information.") +def is_in_ray_actor(): + """Check if we are in a Ray actor.""" + + try: + import ray + return (ray.is_initialized() + and ray.get_runtime_context().get_actor_id() is not None) + except ImportError: + return False + + +def _maybe_force_spawn(): + """Check if we need to force the use of the `spawn` multiprocessing start + method. + """ + if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") == "spawn": + return + + reason = None + if cuda_is_initialized(): + reason = "CUDA is initialized" + elif is_in_ray_actor(): + reason = "In a Ray actor and can only be spawned" + + if reason is not None: + logger.warning( + "We must use the `spawn` multiprocessing start method. " + "Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. " + "See https://docs.vllm.ai/en/latest/getting_started/" + "troubleshooting.html#python-multiprocessing " + "for more information. Reason: %s", reason) os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" def get_mp_context(): - _check_multiproc_method() + """Get a multiprocessing context with a particular method (spawn or fork). + By default we follow the value of the VLLM_WORKER_MULTIPROC_METHOD to + determine the multiprocessing method (default is fork). 
However, under + certain conditions, we may enforce spawn and override the value of + VLLM_WORKER_MULTIPROC_METHOD. + """ + _maybe_force_spawn() mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD return multiprocessing.get_context(mp_method) From e588ac237ca52fc8c5c3e18b85972a8b3a0262db Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 20 Mar 2025 19:55:47 -0700 Subject: [PATCH 14/40] Add an example for reproducibility (#15262) Signed-off-by: Woosuk Kwon --- examples/offline_inference/reproduciblity.py | 36 ++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 examples/offline_inference/reproduciblity.py diff --git a/examples/offline_inference/reproduciblity.py b/examples/offline_inference/reproduciblity.py new file mode 100644 index 0000000000000..d0197bf6d5ba0 --- /dev/null +++ b/examples/offline_inference/reproduciblity.py @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: Apache-2.0 +import os + +from vllm import LLM, SamplingParams + +# vLLM does not guarantee the reproducibility of the results by default, +# for the sake of performance. You need to do the following to achieve +# reproducible results: +# 1. Turn off multiprocessing to make the scheduling deterministic. +# NOTE(woosuk): This is not needed and will be ignored for V0. +os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" +# 2. Fix the global seed for reproducibility. The default seed is None, which is +# not reproducible. +SEED = 42 + +# NOTE(woosuk): Even with the above two settings, vLLM only provides +# reproducibility when it runs on the same hardware and the same vLLM version. +# Also, the online serving API (`vllm serve`) does not support reproducibility +# because it is almost impossible to make the scheduling deterministic in the +# online serving setting. 
+ +llm = LLM(model="facebook/opt-125m", seed=SEED) + +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +outputs = llm.generate(prompts, sampling_params) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") From b15fd2be2acae28b25253840588f3eb135a3f709 Mon Sep 17 00:00:00 2001 From: Siyuan Liu Date: Thu, 20 Mar 2025 20:05:28 -0700 Subject: [PATCH 15/40] [Hardware][TPU] Add check for no additional graph compilation during runtime (#14710) Signed-off-by: Siyuan Liu --- .buildkite/run-tpu-v1-test.sh | 14 ++++++++------ vllm/envs.py | 5 +++++ vllm/v1/worker/tpu_model_runner.py | 19 +++++++++++++++++++ 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/.buildkite/run-tpu-v1-test.sh b/.buildkite/run-tpu-v1-test.sh index 82f40c650f8cf..6562942ea3f8c 100755 --- a/.buildkite/run-tpu-v1-test.sh +++ b/.buildkite/run-tpu-v1-test.sh @@ -19,17 +19,19 @@ docker run --privileged --net host --shm-size=16G -it \ vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \ && python3 -m pip install pytest \ && python3 -m pip install lm_eval[api]==0.4.4 \ + && export VLLM_USE_V1=1 \ + && export VLLM_XLA_CHECK_RECOMPILATION=1 \ && echo TEST_1 \ - && VLLM_USE_V1=1 python3 /workspace/vllm/tests/tpu/test_compilation.py \ + && python3 /workspace/vllm/tests/tpu/test_compilation.py \ && echo TEST_2 \ - && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \ + && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \ && echo TEST_3 \ - && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \ + && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \ && echo TEST_4 \ - && 
VLLM_USE_V1=1 pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \ + && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \ && echo TEST_5 \ - && VLLM_USE_V1=1 python3 /workspace/vllm/examples/offline_inference/tpu.py" \ - + && python3 /workspace/vllm/examples/offline_inference/tpu.py" \ + # TODO: This test fails because it uses RANDOM_SEED sampling # && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ diff --git a/vllm/envs.py b/vllm/envs.py index d88ab3b5e7d06..d54de9da25315 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -45,6 +45,7 @@ if TYPE_CHECKING: VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache") + VLLM_XLA_CHECK_RECOMPILATION: bool = False VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024 VLLM_USE_RAY_SPMD_WORKER: bool = False VLLM_USE_RAY_COMPILED_DAG: bool = False @@ -446,6 +447,10 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_XLA_CACHE_PATH", os.path.join(get_default_cache_root(), "vllm", "xla_cache"), )), + + # If set, assert on XLA recompilation after each execution step. 
+ "VLLM_XLA_CHECK_RECOMPILATION": + lambda: bool(int(os.getenv("VLLM_XLA_CHECK_RECOMPILATION", "0"))), "VLLM_FUSED_MOE_CHUNK_SIZE": lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")), diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index ec3dcbc064cba..d772a3ee13ec3 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -11,6 +11,7 @@ import torch.nn as nn import torch_xla.core.xla_model as xm import torch_xla.runtime as xr +import vllm.envs as envs from vllm.attention.backends.abstract import AttentionType from vllm.attention.layer import Attention from vllm.config import VllmConfig @@ -73,6 +74,10 @@ class TPUModelRunner: scheduler_config = self.scheduler_config parallel_config = self.parallel_config self.device = device + self.check_recompilation = envs.VLLM_XLA_CHECK_RECOMPILATION + if self.check_recompilation: + self.num_xla_graphs = xr.get_num_cached_compilation_graph() + self.enforce_eager = model_config.enforce_eager self.pin_memory = is_pin_memory_available() self.dtype = self.model_config.dtype @@ -671,6 +676,12 @@ class TPUModelRunner: logprobs=None, prompt_logprobs_dict=prompt_logprobs_dict, ) + # Check there is no new graph compilation, all the graphs should be + # captured and compiled during warming up. + if self.check_recompilation and not self.enforce_eager: + curr_cached_graph = xr.get_num_cached_compilation_graph() + assert self.num_xla_graphs == curr_cached_graph, ( + "Recompilation after warm up is detected.") return model_runner_output def load_model(self) -> None: @@ -810,6 +821,14 @@ class TPUModelRunner: xm.wait_device_ops() end = time.perf_counter() logger.info("Compilation finished in in %.2f [secs].", end - start) + # Record the number cached XLA graph after warming up, this will be + # used for checking there is no additional graph compilation during + # runtime execution. 
+ if self.check_recompilation: + total_cached_graphs = xr.get_num_cached_compilation_graph() + num_compiled_graphs = total_cached_graphs - self.num_xla_graphs + logger.info("Compiled %d XLA graphs.", num_compiled_graphs) + self.num_xla_graphs += num_compiled_graphs def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: """ From f8a08cb90dc0b5b45663cd2605d0c98c77efe009 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 21 Mar 2025 11:14:19 +0800 Subject: [PATCH 16/40] [V1] Enable Triton(ROCm) Attention backend for Nvidia GPUs (#14071) Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Woosuk Kwon --- vllm/engine/arg_utils.py | 2 +- vllm/platforms/cuda.py | 11 +++++++--- vllm/platforms/interface.py | 1 + vllm/platforms/rocm.py | 5 +++-- .../backends/{rocm_attn.py => triton_attn.py} | 20 +++++++++---------- 5 files changed, 23 insertions(+), 16 deletions(-) rename vllm/v1/attention/backends/{rocm_attn.py => triton_attn.py} (91%) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 986d1b407494f..edfa748b82d7b 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1588,7 +1588,7 @@ class EngineArgs: # No FlashInfer or XFormers so far. V1_BACKENDS = [ "FLASH_ATTN_VLLM_V1", "FLASH_ATTN", "PALLAS", "PALLAS_VLLM_V1", - "TRITON_MLA", "FLASHMLA" + "TRITON_ATTN_VLLM_V1", "TRITON_MLA", "FLASHMLA" ] if (envs.is_set("VLLM_ATTENTION_BACKEND") and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS): diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index dd2a9cb6161e7..38d8fffd63c03 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -213,9 +213,14 @@ class CudaPlatformBase(Platform): return ("vllm.attention.backends." "flashmla.FlashMLABackend") if use_v1: - logger.info_once("Using Flash Attention backend on V1 engine.") - return ("vllm.v1.attention.backends.flash_attn." 
- "FlashAttentionBackend") + if selected_backend == _Backend.TRITON_ATTN_VLLM_V1: + logger.info_once("Using Triton backend on V1 engine.") + return ("vllm.v1.attention.backends." + "triton_attn.TritonAttentionBackend") + if cls.has_device_capability(80): + logger.info_once("Using Flash Attention backend on V1 engine.") + return ("vllm.v1.attention.backends." + "flash_attn.FlashAttentionBackend") if selected_backend == _Backend.FLASHINFER: logger.info("Using FlashInfer backend.") return "vllm.attention.backends.flashinfer.FlashInferBackend" diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index c7152d0bfb79e..d3bffaf4d69bf 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -29,6 +29,7 @@ def in_wsl() -> bool: class _Backend(enum.Enum): FLASH_ATTN = enum.auto() FLASH_ATTN_VLLM_V1 = enum.auto() + TRITON_ATTN_VLLM_V1 = enum.auto() XFORMERS = enum.auto() ROCM_FLASH = enum.auto() TORCH_SDPA = enum.auto() diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 75f287b568ac3..ee708f5961df0 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -120,8 +120,9 @@ class RocmPlatform(Platform): selected_backend = (_Backend.ROCM_FLASH if selected_backend == _Backend.FLASH_ATTN else selected_backend) if envs.VLLM_USE_V1: - logger.info("Using ROCm Attention backend on V1 engine.") - return "vllm.v1.attention.backends.rocm_attn.ROCmAttentionBackend" + logger.info("Using Triton Attention backend on V1 engine.") + return ("vllm.v1.attention.backends." + "triton_attn.TritonAttentionBackend") if selected_backend == _Backend.ROCM_FLASH: if not cls.has_device_capability(90): # not Instinct series GPUs. 
diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/triton_attn.py similarity index 91% rename from vllm/v1/attention/backends/rocm_attn.py rename to vllm/v1/attention/backends/triton_attn.py index 640c3b3d4fbb1..f11f2b6271ff8 100644 --- a/vllm/v1/attention/backends/rocm_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -"""Attention layer with PagedAttention on rocm""" +"""Attention layer with PagedAttention and Triton prefix prefill.""" from typing import Any, Optional import torch @@ -16,7 +16,7 @@ from vllm.v1.attention.backends.flash_attn import ( logger = init_logger(__name__) -class ROCmAttentionBackend(AttentionBackend): +class TritonAttentionBackend(AttentionBackend): accept_output_buffer: bool = True @@ -26,11 +26,11 @@ class ROCmAttentionBackend(AttentionBackend): @staticmethod def get_name() -> str: - return "ROCM_ATTN_VLLM_V1" + return "TRITON_ATTN_VLLM_V1" @staticmethod - def get_impl_cls() -> type["ROCmAttentionImpl"]: - return ROCmAttentionImpl + def get_impl_cls() -> type["TritonAttentionImpl"]: + return TritonAttentionImpl @staticmethod def get_metadata_cls() -> type["AttentionMetadata"]: @@ -56,7 +56,7 @@ class ROCmAttentionBackend(AttentionBackend): return FlashAttentionMetadataBuilder -class ROCmAttentionImpl(AttentionImpl): +class TritonAttentionImpl(AttentionImpl): def __init__( self, @@ -73,7 +73,7 @@ class ROCmAttentionImpl(AttentionImpl): ) -> None: if blocksparse_params is not None: raise ValueError( - "ROCmAttention does not support block-sparse attention.") + "TritonAttention does not support block-sparse attention.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) @@ -90,17 +90,17 @@ class ROCmAttentionImpl(AttentionImpl): assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads - support_head_sizes = ROCmAttentionBackend.get_supported_head_sizes() + 
support_head_sizes = TritonAttentionBackend.get_supported_head_sizes() if head_size not in support_head_sizes: raise ValueError( - f"Head size {head_size} is not supported by ROCmAttention. " + f"Head size {head_size} is not supported by TritonAttention. " f"Supported head sizes are: {support_head_sizes}.") if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " "encoder/decoder cross-attention " "are not implemented for " - "ROCmAttentionImpl") + "TritonAttentionImpl") def forward( self, From 7297941b383a62d0212186f4615db857cd932b0c Mon Sep 17 00:00:00 2001 From: Edwin Hernandez Date: Thu, 20 Mar 2025 21:18:47 -0700 Subject: [PATCH 17/40] [Doc] Update LWS docs (#15163) Signed-off-by: Edwinhr716 --- docs/source/deployment/frameworks/lws.md | 191 ++++++++++++++++++++++- 1 file changed, 189 insertions(+), 2 deletions(-) diff --git a/docs/source/deployment/frameworks/lws.md b/docs/source/deployment/frameworks/lws.md index 349fa83fbcb9d..4e9a03b5c4c17 100644 --- a/docs/source/deployment/frameworks/lws.md +++ b/docs/source/deployment/frameworks/lws.md @@ -7,5 +7,192 @@ A major use case is for multi-host/multi-node distributed inference. vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kubernetes for distributed model serving. -Please see [this guide](https://github.com/kubernetes-sigs/lws/tree/main/docs/examples/vllm) for more details on -deploying vLLM on Kubernetes using LWS. +## Prerequisites + +* At least two Kubernetes nodes, each with 8 GPUs, are required. +* Install LWS by following the instructions found [here](https://lws.sigs.k8s.io/docs/installation/). 
+ +## Deploy and Serve + +Deploy the following yaml file `lws.yaml` + +```yaml +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: vllm +spec: + replicas: 2 + leaderWorkerTemplate: + size: 2 + restartPolicy: RecreateGroupOnPodRestart + leaderTemplate: + metadata: + labels: + role: leader + spec: + containers: + - name: vllm-leader + image: docker.io/vllm/vllm-openai:latest + env: + - name: HUGGING_FACE_HUB_TOKEN + value: + command: + - sh + - -c + - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); + python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2" + resources: + limits: + nvidia.com/gpu: "8" + memory: 1124Gi + ephemeral-storage: 800Gi + requests: + ephemeral-storage: 800Gi + cpu: 125 + ports: + - containerPort: 8080 + readinessProbe: + tcpSocket: + port: 8080 + initialDelaySeconds: 15 + periodSeconds: 10 + volumeMounts: + - mountPath: /dev/shm + name: dshm + volumes: + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 15Gi + workerTemplate: + spec: + containers: + - name: vllm-worker + image: docker.io/vllm/vllm-openai:latest + command: + - sh + - -c + - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)" + resources: + limits: + nvidia.com/gpu: "8" + memory: 1124Gi + ephemeral-storage: 800Gi + requests: + ephemeral-storage: 800Gi + cpu: 125 + env: + - name: HUGGING_FACE_HUB_TOKEN + value: + volumeMounts: + - mountPath: /dev/shm + name: dshm + volumes: + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 15Gi +--- +apiVersion: v1 +kind: Service +metadata: + name: vllm-leader +spec: + ports: + - name: http + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + leaderworkerset.sigs.k8s.io/name: vllm + role: leader + type: ClusterIP +``` + +```bash +kubectl apply -f lws.yaml +``` + +Verify 
the status of the pods: + +```bash +kubectl get pods +``` + +Should get an output similar to this: + +```bash +NAME READY STATUS RESTARTS AGE +vllm-0 1/1 Running 0 2s +vllm-0-1 1/1 Running 0 2s +vllm-1 1/1 Running 0 2s +vllm-1-1 1/1 Running 0 2s +``` + +Verify that the distributed tensor-parallel inference works: + +```bash +kubectl logs vllm-0 |grep -i "Loading model weights took" +``` + +Should get something similar to this: + +```text +INFO 05-08 03:20:24 model_runner.py:173] Loading model weights took 0.1189 GB +(RayWorkerWrapper pid=169, ip=10.20.0.197) INFO 05-08 03:20:28 model_runner.py:173] Loading model weights took 0.1189 GB +``` + +## Access ClusterIP service + +```bash +# Listen on port 8080 locally, forwarding to the targetPort of the service's port 8080 in a pod selected by the service +kubectl port-forward svc/vllm-leader 8080:8080 +``` + +The output should be similar to the following: + +```text +Forwarding from 127.0.0.1:8080 -> 8080 +Forwarding from [::1]:8080 -> 8080 +``` + +## Serve the model + +Open another terminal and send a request + +```text +curl http://localhost:8080/v1/completions \ +-H "Content-Type: application/json" \ +-d '{ + "model": "meta-llama/Meta-Llama-3.1-405B-Instruct", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0 +}' +``` + +The output should be similar to the following + +```text +{ + "id": "cmpl-1bb34faba88b43f9862cfbfb2200949d", + "object": "text_completion", + "created": 1715138766, + "model": "meta-llama/Meta-Llama-3.1-405B-Instruct", + "choices": [ + { + "index": 0, + "text": " top destination for foodies, with", + "logprobs": null, + "finish_reason": "length", + "stop_reason": null + } + ], + "usage": { + "prompt_tokens": 5, + "total_tokens": 12, + "completion_tokens": 7 + } +} +``` From da6ea29f7a60b75ac4fe7b0ae5204ba96298011c Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 20 Mar 2025 22:24:10 -0700 Subject: [PATCH 18/40] [V1] Avoid redundant input processing in n>1 case (#14985) 
Signed-off-by: Nick Hill --- tests/lora/test_tokenizer_group.py | 6 +- tests/tokenization/test_tokenizer_group.py | 27 +++----- vllm/engine/async_llm_engine.py | 1 - vllm/engine/llm_engine.py | 1 - vllm/engine/protocol.py | 5 +- vllm/inputs/preprocess.py | 63 +++---------------- .../tokenizer_group/base_tokenizer_group.py | 2 - .../tokenizer_group/ray_tokenizer_group.py | 10 +-- .../tokenizer_group/tokenizer_group.py | 2 - vllm/v1/engine/async_llm.py | 57 ++++++++++------- vllm/v1/engine/llm_engine.py | 42 ++++++++----- vllm/v1/engine/parallel_sampling.py | 13 +--- vllm/v1/engine/processor.py | 1 - 13 files changed, 85 insertions(+), 145 deletions(-) diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py index 589167e801f19..d605ab7346889 100644 --- a/tests/lora/test_tokenizer_group.py +++ b/tests/lora/test_tokenizer_group.py @@ -24,12 +24,10 @@ async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type): ) lora_request = LoRARequest("1", 1, sql_lora_files) assert reference_tokenizer.encode("prompt") == tokenizer_group.encode( - request_id="request_id", prompt="prompt", lora_request=lora_request) + prompt="prompt", lora_request=lora_request) assert reference_tokenizer.encode( "prompt") == await tokenizer_group.encode_async( - request_id="request_id", - prompt="prompt", - lora_request=lora_request) + prompt="prompt", lora_request=lora_request) assert isinstance(tokenizer_group.get_lora_tokenizer(None), PreTrainedTokenizerBase) assert tokenizer_group.get_lora_tokenizer( diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py index d1873823ac187..5b62f992c1be7 100644 --- a/tests/tokenization/test_tokenizer_group.py +++ b/tests/tokenization/test_tokenizer_group.py @@ -41,10 +41,10 @@ async def test_tokenizer_group(tokenizer_group_type): max_input_length=None, ) assert reference_tokenizer.encode("prompt") == tokenizer_group.encode( - request_id="request_id", prompt="prompt", 
lora_request=None) + prompt="prompt", lora_request=None) assert reference_tokenizer.encode( - "prompt") == await tokenizer_group.encode_async( - request_id="request_id", prompt="prompt", lora_request=None) + "prompt") == await tokenizer_group.encode_async(prompt="prompt", + lora_request=None) assert isinstance(tokenizer_group.get_lora_tokenizer(None), PreTrainedTokenizerBase) assert tokenizer_group.get_lora_tokenizer( @@ -69,8 +69,7 @@ async def test_tokenizer_group_pool(tokenizer_group_type): # and check that all requests are processed correctly. num_requests = tokenizer_group_pool.pool_size * 5 requests = [ - tokenizer_group_pool.encode_async(request_id=str(i), - prompt=f"prompt {i}", + tokenizer_group_pool.encode_async(prompt=f"prompt {i}", lora_request=None) for i in range(num_requests) ] @@ -161,12 +160,8 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type): fail_at[0] = 1000 # We should recover successfully. - await tokenizer_group_pool.encode_async(request_id="1", - prompt="prompt", - lora_request=None) - await tokenizer_group_pool.encode_async(request_id="1", - prompt="prompt", - lora_request=None) + await tokenizer_group_pool.encode_async(prompt="prompt", lora_request=None) + await tokenizer_group_pool.encode_async(prompt="prompt", lora_request=None) # Check that we have a new actor assert len(tokenizer_group_pool.tokenizer_actors) == len(tokenizer_actors) @@ -184,8 +179,7 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type): # We should fail after re-initialization. 
with pytest.raises(RuntimeError): - await tokenizer_group_pool.encode_async(request_id="1", - prompt="prompt", + await tokenizer_group_pool.encode_async(prompt="prompt", lora_request=None) # check_health should raise the same thing @@ -206,11 +200,8 @@ async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type): # Prompt too long error with pytest.raises(ValueError): - await tokenizer_group_pool.encode_async(request_id="1", - prompt="prompt" * 100, + await tokenizer_group_pool.encode_async(prompt="prompt" * 100, lora_request=None) - await tokenizer_group_pool.encode_async(request_id="1", - prompt="prompt", - lora_request=None) + await tokenizer_group_pool.encode_async(prompt="prompt", lora_request=None) # Actors should stay the same. assert tokenizer_group_pool.tokenizer_actors == tokenizer_actors diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index c6fafbeea9c1d..91b9cc62719a4 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -492,7 +492,6 @@ class _AsyncLLMEngine(LLMEngine): preprocessed_inputs = await self.input_preprocessor.preprocess_async( prompt, - request_id=request_id, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, ) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 51a82c415a8ae..b9a8b6a530656 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -783,7 +783,6 @@ class LLMEngine: preprocessed_inputs = self.input_preprocessor.preprocess( prompt, - request_id=request_id, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, ) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index be9f3af0b5452..d2f2c226d2fc4 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -81,10 +81,7 @@ class EngineClient(ABC): if is_explicit_encoder_decoder_prompt(prompt): raise NotImplementedError else: - processed_inputs = preprocessor._prompt_to_llm_inputs( - prompt, - 
request_id=request_id, - ) + processed_inputs = preprocessor._prompt_to_llm_inputs(prompt) prompt_token_ids = processed_inputs["prompt_token_ids"] prompt_text = processed_inputs.get("prompt") diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index af35e43d825a2..33f39bedea5b5 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -182,7 +182,6 @@ class InputPreprocessor: def _tokenize_prompt( self, prompt: str, - request_id: str, lora_request: Optional[LoRARequest], ) -> list[int]: """ @@ -202,15 +201,13 @@ class InputPreprocessor: "do_lower_case", False)): prompt = prompt.lower() - return tokenizer.encode(request_id=request_id, - prompt=prompt, + return tokenizer.encode(prompt=prompt, lora_request=lora_request, add_special_tokens=add_special_tokens) async def _tokenize_prompt_async( self, prompt: str, - request_id: str, lora_request: Optional[LoRARequest], ) -> list[int]: """Async version of :meth:`_tokenize_prompt`.""" @@ -222,7 +219,6 @@ class InputPreprocessor: # appending an EOS token to the prompt which disrupts generation. 
add_special_tokens = False return await tokenizer.encode_async( - request_id=request_id, prompt=prompt, lora_request=lora_request, add_special_tokens=add_special_tokens) @@ -309,7 +305,6 @@ class InputPreprocessor: def _prompt_to_llm_inputs( self, prompt: SingletonPrompt, - request_id: str, lora_request: Optional[LoRARequest] = None, return_mm_hashes: bool = False, ) -> SingletonInputs: @@ -318,7 +313,6 @@ class InputPreprocessor: Arguments: - * request_id * prompt: single encoder or decoder input prompt * lora_request: this is only valid for decoder prompts * return_mm_hashes: whether to return multimodal hashes @@ -333,7 +327,6 @@ class InputPreprocessor: prompt_text = parsed["content"] prompt_token_ids = self._tokenize_prompt( prompt_text, - request_id=request_id, lora_request=lora_request, ) @@ -384,7 +377,6 @@ class InputPreprocessor: prompt_token_ids = self._tokenize_prompt( prompt_text, - request_id=request_id, lora_request=lora_request, ) @@ -400,7 +392,6 @@ class InputPreprocessor: async def _prompt_to_llm_inputs_async( self, prompt: SingletonPrompt, - request_id: str, lora_request: Optional[LoRARequest] = None, return_mm_hashes: bool = False, ) -> SingletonInputs: @@ -411,7 +402,6 @@ class InputPreprocessor: prompt_text = parsed["content"] prompt_token_ids = await self._tokenize_prompt_async( prompt_text, - request_id=request_id, lora_request=lora_request, ) @@ -460,7 +450,6 @@ class InputPreprocessor: prompt_token_ids = await self._tokenize_prompt_async( prompt_text, - request_id=request_id, lora_request=lora_request, ) @@ -560,7 +549,6 @@ class InputPreprocessor: def _process_encoder_decoder_prompt( self, prompt: PromptType, - request_id: str, ) -> EncoderDecoderInputs: """ For encoder/decoder models only: @@ -587,7 +575,6 @@ class InputPreprocessor: Arguments: * prompt: an input prompt - * request_id Returns: @@ -598,16 +585,11 @@ class InputPreprocessor: if is_explicit_encoder_decoder_prompt(prompt): encoder_inputs = self._prompt_to_llm_inputs( - 
prompt["encoder_prompt"], - request_id=request_id, - ) + prompt["encoder_prompt"]) if (decoder_input := prompt["decoder_prompt"]) is None: decoder_inputs = None else: - decoder_inputs = self._prompt_to_llm_inputs( - decoder_input, - request_id=request_id, - ) + decoder_inputs = self._prompt_to_llm_inputs(decoder_input) # For multimodal model, override decoder prompt from processor # with explicit decoder prompt. if self.model_config.is_multimodal_model and ( @@ -616,10 +598,7 @@ class InputPreprocessor: self._separate_enc_dec_inputs_from_mm_processor_outputs( encoder_inputs, decoder_inputs)) else: - inputs = self._prompt_to_llm_inputs( - prompt, - request_id=request_id, - ) + inputs = self._prompt_to_llm_inputs(prompt) if self.model_config.is_multimodal_model and ( self._can_process_multimodal()): # Encoder-Decoder Multimodal model @@ -636,7 +615,6 @@ class InputPreprocessor: async def _process_encoder_decoder_prompt_async( self, prompt: PromptType, - request_id: str, ) -> EncoderDecoderInputs: """Async version of :meth:`_process_encoder_decoder_prompt`.""" encoder_inputs: SingletonInputs @@ -644,18 +622,13 @@ class InputPreprocessor: if is_explicit_encoder_decoder_prompt(prompt): encoder_task = self._prompt_to_llm_inputs_async( - prompt["encoder_prompt"], - request_id=request_id, - ) + prompt["encoder_prompt"]) if (decoder_input := prompt["decoder_prompt"]) is None: encoder_inputs = await encoder_task decoder_inputs = None else: - decoder_task = self._prompt_to_llm_inputs_async( - decoder_input, - request_id=request_id, - ) + decoder_task = self._prompt_to_llm_inputs_async(decoder_input) encoder_inputs, decoder_inputs = await asyncio.gather( encoder_task, decoder_task) @@ -668,10 +641,7 @@ class InputPreprocessor: self._separate_enc_dec_inputs_from_mm_processor_outputs( encoder_inputs, decoder_inputs)) else: - inputs = await self._prompt_to_llm_inputs_async( - prompt, - request_id=request_id, - ) + inputs = await self._prompt_to_llm_inputs_async(prompt) if 
self.model_config.is_multimodal_model and ( self._can_process_multimodal()): # Encoder-Decoder Multimodal model @@ -704,7 +674,6 @@ class InputPreprocessor: def _process_decoder_only_prompt( self, prompt: SingletonPrompt, - request_id: str, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, return_mm_hashes: bool = False, @@ -716,7 +685,6 @@ class InputPreprocessor: Arguments: * prompt: input prompt - * request_id * lora_request * prompt_adapter_request * return_mm_hashes @@ -728,7 +696,6 @@ class InputPreprocessor: prompt_comps = self._prompt_to_llm_inputs( prompt, - request_id=request_id, lora_request=lora_request, return_mm_hashes=return_mm_hashes, ) @@ -741,7 +708,6 @@ class InputPreprocessor: async def _process_decoder_only_prompt_async( self, prompt: SingletonPrompt, - request_id: str, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, return_mm_hashes: bool = False, @@ -749,7 +715,6 @@ class InputPreprocessor: """Async version of :meth:`_process_decoder_only_prompt`.""" prompt_comps = await self._prompt_to_llm_inputs_async( prompt, - request_id=request_id, lora_request=lora_request, return_mm_hashes=return_mm_hashes, ) @@ -762,7 +727,6 @@ class InputPreprocessor: def preprocess( self, prompt: PromptType, - request_id: str, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, return_mm_hashes: bool = False, @@ -774,10 +738,7 @@ class InputPreprocessor: "returned until they are supported on vLLM V1.") # Encoder-decoder model requires special mapping of # input prompts to encoder & decoder - return self._process_encoder_decoder_prompt( - prompt, - request_id=request_id, - ) + return self._process_encoder_decoder_prompt(prompt) if is_explicit_encoder_decoder_prompt(prompt): raise ValueError("Cannot pass encoder-decoder prompt " @@ -786,7 +747,6 @@ class InputPreprocessor: # Decoder-only operation 
return self._process_decoder_only_prompt( prompt, - request_id=request_id, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, return_mm_hashes=return_mm_hashes, @@ -795,7 +755,6 @@ class InputPreprocessor: async def preprocess_async( self, prompt: PromptType, - request_id: str, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, return_mm_hashes: bool = False, @@ -807,10 +766,7 @@ class InputPreprocessor: "returned until they are supported on vLLM V1.") # Encoder-decoder model requires special mapping of # input prompts to encoder & decoder - return await self._process_encoder_decoder_prompt_async( - prompt, - request_id=request_id, - ) + return await self._process_encoder_decoder_prompt_async(prompt) if is_explicit_encoder_decoder_prompt(prompt): raise ValueError("Cannot pass encoder-decoder prompt " @@ -819,7 +775,6 @@ class InputPreprocessor: # Decoder-only operation return await self._process_decoder_only_prompt_async( prompt, - request_id=request_id, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, return_mm_hashes=return_mm_hashes, diff --git a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py index fbdfa3e57e172..c5108a7fc6ebc 100644 --- a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py @@ -33,7 +33,6 @@ class BaseTokenizerGroup(ABC): @abstractmethod def encode(self, prompt: str, - request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None, add_special_tokens: Optional[bool] = None) -> List[int]: """Encode a prompt using the tokenizer group.""" @@ -43,7 +42,6 @@ class BaseTokenizerGroup(ABC): async def encode_async( self, prompt: str, - request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None, add_special_tokens: Optional[bool] = None) -> List[int]: 
"""Encode a prompt using the tokenizer group.""" diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py index 30cab752ccf3c..b048b8094174a 100644 --- a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py @@ -113,7 +113,6 @@ class RayTokenizerGroupPool(BaseTokenizerGroup): def encode(self, prompt: str, - request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None, add_special_tokens: Optional[bool] = None) -> List[int]: """Encode a prompt using the tokenizer group. @@ -133,8 +132,7 @@ class RayTokenizerGroupPool(BaseTokenizerGroup): original_actor = actor try: ret = ray.get( - actor.encode.remote(request_id=request_id, - prompt=prompt, + actor.encode.remote(prompt=prompt, lora_request=lora_request, add_special_tokens=add_special_tokens)) except ActorDiedError as e: @@ -145,8 +143,7 @@ class RayTokenizerGroupPool(BaseTokenizerGroup): actor = self._init_actor() try: ret = ray.get( - actor.encode.remote(request_id=request_id, - prompt=prompt, + actor.encode.remote(prompt=prompt, lora_request=lora_request, add_special_tokens=add_special_tokens)) except ActorDiedError as e: @@ -164,7 +161,6 @@ class RayTokenizerGroupPool(BaseTokenizerGroup): async def encode_async( self, prompt: str, - request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None, add_special_tokens: Optional[bool] = None) -> List[int]: """Encode a prompt using the tokenizer group. 
@@ -184,7 +180,6 @@ class RayTokenizerGroupPool(BaseTokenizerGroup): original_actor = actor try: ret = await actor.encode.remote( - request_id=request_id, prompt=prompt, lora_request=lora_request, add_special_tokens=add_special_tokens) @@ -196,7 +191,6 @@ class RayTokenizerGroupPool(BaseTokenizerGroup): actor = self._init_actor() try: ret = await actor.encode.remote( - request_id=request_id, prompt=prompt, lora_request=lora_request, add_special_tokens=add_special_tokens) diff --git a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py index 025971cb7e477..b6e9005bcd241 100644 --- a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py @@ -56,7 +56,6 @@ class TokenizerGroup(BaseTokenizerGroup): def encode(self, prompt: str, - request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None, add_special_tokens: Optional[bool] = None) -> List[int]: tokenizer = self.get_lora_tokenizer(lora_request) @@ -69,7 +68,6 @@ class TokenizerGroup(BaseTokenizerGroup): async def encode_async( self, prompt: str, - request_id: Optional[str] = None, lora_request: Optional[LoRARequest] = None, add_special_tokens: Optional[bool] = None) -> List[int]: tokenizer = await self.get_lora_tokenizer_async(lora_request) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 171c1c7da28e8..e0169f1a4ded8 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -4,6 +4,7 @@ import asyncio import logging import os from collections.abc import AsyncGenerator, Mapping +from copy import copy from typing import Optional, Union import numpy as np @@ -25,6 +26,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext from vllm.utils import Device, cdiv, kill_process_tree +from 
vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.parallel_sampling import ParentRequest @@ -177,34 +179,45 @@ class AsyncLLM(EngineClient): ) -> asyncio.Queue[RequestOutput]: """Add new request to the AsyncLLM.""" - # 1) Create a new output queue for the request. + # Create a new output queue for the request. queue: asyncio.Queue[RequestOutput] = asyncio.Queue() - # 2) Fan out child requests (for n>1) - parent_req = ParentRequest.from_params(request_id, params) + # Convert Input --> Request. + request = self.processor.process_inputs(request_id, prompt, params, + arrival_time, lora_request, + trace_headers, + prompt_adapter_request, + priority) + n = params.n if isinstance(params, SamplingParams) else 1 + + if n == 1: + await self._add_request(request, None, 0, queue) + return queue + + # Fan out child requests (for n>1). + parent_request = ParentRequest(request_id, params) for idx in range(n): - if parent_req is not None: - request_id, params = parent_req.get_child_info(idx) - - # 3) Convert Input --> Request. - request = self.processor.process_inputs(request_id, prompt, params, - arrival_time, lora_request, - trace_headers, - prompt_adapter_request, - priority) - - # 4) Add the request to OutputProcessor (this process). - self.output_processor.add_request(request, parent_req, idx, queue) - - # 5) Add the EngineCoreRequest to EngineCore (separate process). 
- await self.engine_core.add_request_async(request) - - if self.log_requests: - logger.info("Added request %s.", request_id) - + request_id, params = parent_request.get_child_info(idx) + child_request = request if idx == n - 1 else copy(request) + child_request.request_id = request_id + child_request.sampling_params = params + await self._add_request(child_request, parent_request, idx, queue) return queue + async def _add_request(self, request: EngineCoreRequest, + parent_req: Optional[ParentRequest], index: int, + queue: asyncio.Queue[RequestOutput]): + + # Add the request to OutputProcessor (this process). + self.output_processor.add_request(request, parent_req, index, queue) + + # Add the EngineCoreRequest to EngineCore (separate process). + await self.engine_core.add_request_async(request) + + if self.log_requests: + logger.info("Added request %s.", request.request_id) + # TODO: we should support multiple prompts in one call, as you # can do with LLM.generate. So that for multi-prompt completion # requests we don't need to send multiple messages to core proc, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 14338e5cbe88f..7bda3a30d2028 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from collections.abc import Mapping +from copy import copy from typing import Optional, Union from typing_extensions import TypeVar @@ -179,25 +180,34 @@ class LLMEngine: prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> None: - # 1) Fan out child requests (for n>1) - parent_req = ParentRequest.from_params(request_id, params) + # Process raw inputs into the request. 
+ request = self.processor.process_inputs(request_id, prompt, params, + arrival_time, lora_request, + trace_headers, + prompt_adapter_request, + priority) + n = params.n if isinstance(params, SamplingParams) else 1 - for idx in range(n): - if parent_req is not None: - request_id, params = parent_req.get_child_info(idx) - # 2) Process raw inputs into the request. - request = self.processor.process_inputs(request_id, prompt, params, - arrival_time, lora_request, - trace_headers, - prompt_adapter_request, - priority) - - # 3) Make a new RequestState and queue. - self.output_processor.add_request(request, parent_req, idx) - - # 3) Add the request to EngineCore. + if n == 1: + # Make a new RequestState and queue. + self.output_processor.add_request(request, None, 0) + # Add the request to EngineCore. self.engine_core.add_request(request) + return + + # Fan out child requests (for n>1). + parent_req = ParentRequest(request_id, params) + for idx in range(n): + request_id, params = parent_req.get_child_info(idx) + child_request = request if idx == n - 1 else copy(request) + child_request.request_id = request_id + child_request.sampling_params = params + + # Make a new RequestState and queue. + self.output_processor.add_request(child_request, parent_req, idx) + # Add the request to EngineCore. 
+ self.engine_core.add_request(child_request) def step(self) -> list[RequestOutput]: diff --git a/vllm/v1/engine/parallel_sampling.py b/vllm/v1/engine/parallel_sampling.py index 0eeca657406e5..4df7ca59731ec 100644 --- a/vllm/v1/engine/parallel_sampling.py +++ b/vllm/v1/engine/parallel_sampling.py @@ -1,10 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 from copy import copy -from typing import Optional, Union +from typing import Optional from vllm.outputs import CompletionOutput -from vllm.pooling_params import PoolingParams from vllm.sampling_params import RequestOutputKind, SamplingParams from vllm.v1.metrics.stats import IterationStats @@ -43,16 +42,6 @@ class ParentRequest: self.max_num_generation_tokens = 0 self.cached_child_sampling_params = None - @classmethod - def from_params( - cls, - request_id: str, - params: Union[SamplingParams, PoolingParams], - ) -> Optional['ParentRequest']: - if not isinstance(params, SamplingParams) or params.n == 1: - return None - return cls(request_id, params) - def _get_child_sampling_params( self, index: int, diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index d823c45d59906..55e0fdcd65b5e 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -173,7 +173,6 @@ class Processor: # 3. Apply prompt adapter to prompt token ids if one exists. 
processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess( prompt, - request_id=request_id, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, return_mm_hashes=self.use_hash, From 0fa3970debe14162b5d2fcbbe9acd80beaf08f92 Mon Sep 17 00:00:00 2001 From: Wei Zeng <48810492+wayzeng@users.noreply.github.com> Date: Fri, 21 Mar 2025 00:26:03 -0700 Subject: [PATCH 19/40] [Feature] specify model in config.yaml (#14855) Signed-off-by: weizeng --- .../serving/openai_compatible_server.md | 4 +- tests/{data => config}/test_config.yaml | 0 tests/config/test_config_with_model.yaml | 7 +++ tests/conftest.py | 12 +++++ tests/test_utils.py | 53 +++++++++++++++---- vllm/entrypoints/cli/serve.py | 22 ++++---- vllm/utils.py | 34 ++++++++---- 7 files changed, 102 insertions(+), 30 deletions(-) rename tests/{data => config}/test_config.yaml (100%) create mode 100644 tests/config/test_config_with_model.yaml diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index a6ec05f45b69b..378405d3690f2 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -184,6 +184,7 @@ For example: ```yaml # config.yaml +model: meta-llama/Llama-3.1-8B-Instruct host: "127.0.0.1" port: 6379 uvicorn-log-level: "info" @@ -192,12 +193,13 @@ uvicorn-log-level: "info" To use the above config file: ```bash -vllm serve SOME_MODEL --config config.yaml +vllm serve --config config.yaml ``` :::{note} In case an argument is supplied simultaneously using command line and the config file, the value from the command line will take precedence. The order of priorities is `command line > config file values > defaults`. +e.g. `vllm serve SOME_MODEL --config config.yaml`, SOME_MODEL takes precedence over `model` in config file. 
::: ## API Reference diff --git a/tests/data/test_config.yaml b/tests/config/test_config.yaml similarity index 100% rename from tests/data/test_config.yaml rename to tests/config/test_config.yaml diff --git a/tests/config/test_config_with_model.yaml b/tests/config/test_config_with_model.yaml new file mode 100644 index 0000000000000..d8c8c7bc8162a --- /dev/null +++ b/tests/config/test_config_with_model.yaml @@ -0,0 +1,7 @@ +# Same as test_config.yaml but with model specified +model: config-model +port: 12312 +served_model_name: mymodel +tensor_parallel_size: 2 +trust_remote_code: true +multi_step_stream_outputs: false diff --git a/tests/conftest.py b/tests/conftest.py index 0c71d98101647..8c6046c5817a1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1121,3 +1121,15 @@ def pytest_collection_modifyitems(config, items): for item in items: if "optional" in item.keywords: item.add_marker(skip_optional) + + +@pytest.fixture(scope="session") +def cli_config_file(): + """Return the path to the CLI config file.""" + return os.path.join(_TEST_DIR, "config", "test_config.yaml") + + +@pytest.fixture(scope="session") +def cli_config_file_with_model(): + """Return the path to the CLI config file with model.""" + return os.path.join(_TEST_DIR, "config", "test_config_with_model.yaml") diff --git a/tests/test_utils.py b/tests/test_utils.py index 3660cfa0e49e2..1c33f83c59366 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -8,7 +8,7 @@ from unittest.mock import patch import pytest import torch -from vllm_test_utils import monitor +from vllm_test_utils.monitor import monitor from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config from vllm.utils import (FlexibleArgumentParser, MemorySnapshot, @@ -140,7 +140,8 @@ def parser(): def parser_with_config(): parser = FlexibleArgumentParser() parser.add_argument('serve') - parser.add_argument('model_tag') + parser.add_argument('model_tag', nargs='?') + parser.add_argument('--model', type=str) 
parser.add_argument('--served-model-name', type=str) parser.add_argument('--config', type=str) parser.add_argument('--port', type=int) @@ -196,29 +197,29 @@ def test_missing_required_argument(parser): parser.parse_args([]) -def test_cli_override_to_config(parser_with_config): +def test_cli_override_to_config(parser_with_config, cli_config_file): args = parser_with_config.parse_args([ - 'serve', 'mymodel', '--config', './data/test_config.yaml', + 'serve', 'mymodel', '--config', cli_config_file, '--tensor-parallel-size', '3' ]) assert args.tensor_parallel_size == 3 args = parser_with_config.parse_args([ 'serve', 'mymodel', '--tensor-parallel-size', '3', '--config', - './data/test_config.yaml' + cli_config_file ]) assert args.tensor_parallel_size == 3 assert args.port == 12312 args = parser_with_config.parse_args([ 'serve', 'mymodel', '--tensor-parallel-size', '3', '--config', - './data/test_config.yaml', '--port', '666' + cli_config_file, '--port', '666' ]) assert args.tensor_parallel_size == 3 assert args.port == 666 -def test_config_args(parser_with_config): +def test_config_args(parser_with_config, cli_config_file): args = parser_with_config.parse_args( - ['serve', 'mymodel', '--config', './data/test_config.yaml']) + ['serve', 'mymodel', '--config', cli_config_file]) assert args.tensor_parallel_size == 2 assert args.trust_remote_code assert not args.multi_step_stream_outputs @@ -240,10 +241,9 @@ def test_config_file(parser_with_config): ]) -def test_no_model_tag(parser_with_config): +def test_no_model_tag(parser_with_config, cli_config_file): with pytest.raises(ValueError): - parser_with_config.parse_args( - ['serve', '--config', './data/test_config.yaml']) + parser_with_config.parse_args(['serve', '--config', cli_config_file]) # yapf: enable @@ -476,3 +476,34 @@ def test_swap_dict_values(obj, key1, key2): assert obj[key1] == original_obj[key2] else: assert key1 not in obj + + +def test_model_specification(parser_with_config, + cli_config_file, + 
cli_config_file_with_model): + # Test model in CLI takes precedence over config + args = parser_with_config.parse_args([ + 'serve', 'cli-model', '--config', cli_config_file_with_model + ]) + assert args.model_tag == 'cli-model' + assert args.served_model_name == 'mymodel' + + # Test model from config file works + args = parser_with_config.parse_args([ + 'serve', '--config', cli_config_file_with_model + ]) + assert args.model == 'config-model' + assert args.served_model_name == 'mymodel' + + # Test no model specified anywhere raises error + with pytest.raises(ValueError, match="No model specified!"): + parser_with_config.parse_args(['serve', '--config', cli_config_file]) + + # Test other config values are preserved + args = parser_with_config.parse_args([ + 'serve', 'cli-model', '--config', cli_config_file_with_model + ]) + assert args.tensor_parallel_size == 2 + assert args.trust_remote_code is True + assert args.multi_step_stream_outputs is False + assert args.port == 12312 diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index c345ece4dada9..cf05eb09b37ad 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -21,14 +21,16 @@ class ServeSubcommand(CLISubcommand): @staticmethod def cmd(args: argparse.Namespace) -> None: - # The default value of `--model` - if args.model != EngineArgs.model: - raise ValueError( - "With `vllm serve`, you should provide the model as a " - "positional argument instead of via the `--model` option.") + # If model is specified in CLI (as positional arg), it takes precedence + if hasattr(args, 'model_tag') and args.model_tag is not None: + args.model = args.model_tag + # Otherwise use model from config (already in args.model) - # EngineArgs expects the model name to be passed as --model. 
- args.model = args.model_tag + # Check if we have a model specified somewhere + if args.model == EngineArgs.model: # Still has default value + raise ValueError( + "With `vllm serve`, you should provide the model either as a " + "positional argument or in config file.") uvloop.run(run_server(args)) @@ -41,10 +43,12 @@ class ServeSubcommand(CLISubcommand): serve_parser = subparsers.add_parser( "serve", help="Start the vLLM OpenAI Compatible API server", - usage="vllm serve [options]") + usage="vllm serve [model_tag] [options]") serve_parser.add_argument("model_tag", type=str, - help="The model tag to serve") + nargs='?', + help="The model tag to serve " + "(optional if specified in config)") serve_parser.add_argument( "--config", type=str, diff --git a/vllm/utils.py b/vllm/utils.py index cb375f8ff3228..b723637b25013 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1264,19 +1264,29 @@ class FlexibleArgumentParser(argparse.ArgumentParser): config_args = self._load_config_file(file_path) # 0th index is for {serve,chat,complete} - # followed by model_tag (only for serve) + # optionally followed by model_tag (only for serve) # followed by config args # followed by rest of cli args. # maintaining this order will enforce the precedence # of cli > config > defaults if args[0] == "serve": - if index == 1: + model_in_cli = len(args) > 1 and not args[1].startswith('-') + model_in_config = any(arg == '--model' for arg in config_args) + + if not model_in_cli and not model_in_config: raise ValueError( - "No model_tag specified! Please check your command-line" - " arguments.") - args = [args[0]] + [ - args[1] - ] + config_args + args[2:index] + args[index + 2:] + "No model specified! 
Please specify model either in " + "command-line arguments or in config file.") + + if model_in_cli: + # Model specified as positional arg, keep CLI version + args = [args[0]] + [ + args[1] + ] + config_args + args[2:index] + args[index + 2:] + else: + # No model in CLI, use config if available + args = [args[0] + ] + config_args + args[1:index] + args[index + 2:] else: args = [args[0]] + config_args + args[1:index] + args[index + 2:] @@ -1294,9 +1304,7 @@ class FlexibleArgumentParser(argparse.ArgumentParser): '--port': '12323', '--tensor-parallel-size': '4' ] - """ - extension: str = file_path.split('.')[-1] if extension not in ('yaml', 'yml'): raise ValueError( @@ -1321,7 +1329,15 @@ class FlexibleArgumentParser(argparse.ArgumentParser): if isinstance(action, StoreBoolean) ] + # Skip model from config if it's provided as positional argument + skip_model = (hasattr(self, '_parsed_args') and self._parsed_args + and len(self._parsed_args) > 1 + and self._parsed_args[0] == 'serve' + and not self._parsed_args[1].startswith('-')) + for key, value in config.items(): + if skip_model and key == 'model': + continue if isinstance(value, bool) and key not in store_boolean_arguments: if value: processed_args.append('--' + key) From a989ca2bf6398b63607637892240375c63d76a9e Mon Sep 17 00:00:00 2001 From: Shanshan Shen <467638484@qq.com> Date: Fri, 21 Mar 2025 16:58:28 +0800 Subject: [PATCH 20/40] [Bugfix] Add int8 torch dtype for KVCache (#15260) Signed-off-by: shen-shanshan <467638484@qq.com> --- vllm/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/utils.py b/vllm/utils.py index b723637b25013..55ee044b482cd 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -153,6 +153,7 @@ STR_DTYPE_TO_TORCH_DTYPE = { "fp8": torch.uint8, "fp8_e4m3": torch.uint8, "fp8_e5m2": torch.uint8, + "int8": torch.int8, } TORCH_DTYPE_TO_NUMPY_DTYPE = { From 47c7126213163e454019efc7d913125c22df9d6e Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 21 Mar 2025 18:32:33 +0800 Subject: [PATCH 
21/40] [Misc] Add attention mask pre-computation optimization back to Qwen2.5-VL (#15273) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/qwen2_5_vl.py | 33 +++++++++++++++++------- vllm/model_executor/models/qwen2_vl.py | 18 ++++++++----- 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index adca97c71c581..1e6ff1fec6d5c 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -608,6 +608,17 @@ class Qwen2_5_VisionTransformer(nn.Module): window_index = torch.cat(window_index, dim=0) return window_index, cu_window_seqlens + def compute_attn_mask_seqlen( + self, + cu_seqlens: torch.Tensor, + ) -> tuple[Optional[int], Optional[list[int]]]: + max_seqlen, seqlens = None, None + if self.attn_backend == _Backend.FLASH_ATTN: + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + elif self.attn_backend == _Backend.XFORMERS: + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + return max_seqlen, seqlens + def forward( self, x: torch.Tensor, @@ -645,25 +656,27 @@ class Qwen2_5_VisionTransformer(nn.Module): # transformers hidden_states = hidden_states.unsqueeze(1) - max_seqlen = None - seqlens = None + # pre-compute seqlens for window/full attn to reduce cuMemcpy operations + max_seqlen_full, seqlens_full = self.compute_attn_mask_seqlen( + cu_seqlens) + max_seqlen_window, seqlens_window = self.compute_attn_mask_seqlen( + cu_window_seqlens) for layer_num, blk in enumerate(self.blocks): if layer_num in self.fullatt_block_indexes: cu_seqlens_now = cu_seqlens + max_seqlen_now = max_seqlen_full + seqlens_now = seqlens_full else: cu_seqlens_now = cu_window_seqlens - # pre-compute cu_seqlens for window attn - if self.attn_backend == _Backend.FLASH_ATTN: - max_seqlen = (cu_seqlens_now[1:] - - cu_seqlens_now[:-1]).max().item() - elif self.attn_backend == _Backend.XFORMERS: - seqlens = 
(cu_seqlens_now[1:] - cu_seqlens_now[:-1]).tolist() + max_seqlen_now = max_seqlen_window + seqlens_now = seqlens_window + hidden_states = blk( hidden_states, cu_seqlens=cu_seqlens_now, rotary_pos_emb=rotary_pos_emb, - max_seqlen=max_seqlen, - seqlens=seqlens, + max_seqlen=max_seqlen_now, + seqlens=seqlens_now, ) # For Qwen2.5-VL-3B, float16 will overflow at last block diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index b8ac40b7e7f9b..7537671e1bb82 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -617,6 +617,16 @@ class Qwen2VisionTransformer(nn.Module): rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) return rotary_pos_emb + def compute_attn_mask_seqlen( + self, cu_seqlens: torch.Tensor + ) -> tuple[Optional[int], Optional[list[int]]]: + max_seqlen, seqlens = None, None + if self.attn_backend == _Backend.FLASH_ATTN: + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + elif self.attn_backend == _Backend.XFORMERS: + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + return max_seqlen, seqlens + def forward( self, x: torch.Tensor, @@ -638,12 +648,8 @@ class Qwen2VisionTransformer(nn.Module): # transformers x = x.unsqueeze(1) - max_seqlen = None - seqlens = None - if self.attn_backend == _Backend.FLASH_ATTN: - max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() - elif self.attn_backend == _Backend.XFORMERS: - seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + # pre-compute seqlens for attn mask to reduce cuMemcpy operations + max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens) for blk in self.blocks: x = blk( x, From 84e00adc8a0807487beebae44b900c2e9cb987ac Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 21 Mar 2025 18:54:08 +0800 Subject: [PATCH 22/40] [Bugfix] Fix incorrect resolving order for transformers fallback (#15279) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/registry.py | 12 
+++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 8b469132da6d3..74ae06c55d88e 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -418,11 +418,13 @@ class _ModelRegistry: if not architectures: logger.warning("No model architectures are specified") - normalized_arch = [] - for model in architectures: - if model not in self.models: - model = "TransformersModel" - normalized_arch.append(model) + # filter out support architectures + normalized_arch = list( + filter(lambda model: model in self.models, architectures)) + + # make sure Transformers fallback are put at the last + if len(normalized_arch) != len(architectures): + normalized_arch.append("TransformersModel") return normalized_arch def inspect_model_cls( From 91ca929dc7aa626c926cd992b62a45b47e610b6a Mon Sep 17 00:00:00 2001 From: Lehua Ding Date: Fri, 21 Mar 2025 18:54:11 +0800 Subject: [PATCH 23/40] [V1] Fix wrong import path of get_flash_attn_version (#15280) Signed-off-by: Lehua Ding --- vllm/v1/attention/backends/mla/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 188a425b107e4..31244443108b6 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -195,8 +195,8 @@ from vllm import _custom_ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer, AttentionMetadata, MLAAttentionImpl) -from vllm.attention.backends.utils import get_flash_attn_version from vllm.attention.ops.triton_merge_attn_states import merge_attn_states +from vllm.fa_utils import get_flash_attn_version from vllm.logger import init_logger from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearBase, RowParallelLinear, From 8afcd0f6336b076f1e8328d664f03f41319a65b8 Mon Sep 17 
00:00:00 2001 From: Isotr0py Date: Fri, 21 Mar 2025 19:42:06 +0800 Subject: [PATCH 24/40] [Bugfix] Fix broken kernel test due to missing rename for v1 Triton backend (#15282) Signed-off-by: Isotr0py <2037008807@qq.com> --- tests/kernels/test_attention_selector.py | 2 +- tests/kernels/test_rocm_attention_selector.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index 66db7509cc474..1615c23a4f714 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -49,7 +49,7 @@ def test_env( RocmPlatform()): backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) - EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH" + EXPECTED = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH" assert backend.get_name() == EXPECTED elif device == "openvino": with patch("vllm.attention.selector.current_platform", diff --git a/tests/kernels/test_rocm_attention_selector.py b/tests/kernels/test_rocm_attention_selector.py index 724f0af283f70..90b483b4a41a0 100644 --- a/tests/kernels/test_rocm_attention_selector.py +++ b/tests/kernels/test_rocm_attention_selector.py @@ -26,7 +26,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch): # Test standard ROCm attention backend = get_attn_backend(16, torch.float16, torch.float16, 16, False) assert (backend.get_name() == "ROCM_FLASH" - or backend.get_name() == "ROCM_ATTN_VLLM_V1") + or backend.get_name() == "TRITON_ATTN_VLLM_V1") # mla test for deepseek related backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, From 61e8c183505355e2005314e7004a76049439e165 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 21 Mar 2025 07:56:09 -0400 Subject: [PATCH 25/40] [Misc] Add cProfile helpers (#15074) Signed-off-by: Russell Bryant --- .../contributing/profiling/profiling_index.md | 49 +++++++++++++++++++ vllm/utils.py | 48 ++++++++++++++++++ 2 files changed, 97 
insertions(+) diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md index d6e597ea9e925..ce25daa39c5cb 100644 --- a/docs/source/contributing/profiling/profiling_index.md +++ b/docs/source/contributing/profiling/profiling_index.md @@ -124,3 +124,52 @@ nsys stats report1.nsys-rep GUI example: Screenshot 2025-03-05 at 11 48 42 AM + +## Profiling vLLM Python Code + +The Python standard library includes +[cProfile](https://docs.python.org/3/library/profile.html) for profiling Python +code. vLLM includes a couple of helpers that make it easy to apply it to a section of vLLM. +Both the `vllm.utils.cprofile` and `vllm.utils.cprofile_context` functions can be +used to profile a section of code. + +### Example usage - decorator + +The first helper is a Python decorator that can be used to profile a function. +If a filename is specified, the profile will be saved to that file. If no filename is +specified, profile data will be printed to stdout. + +```python +import vllm.utils + +@vllm.utils.cprofile("expensive_function.prof") +def expensive_function(): + # some expensive code + pass +``` + +### Example Usage - context manager + +The second helper is a context manager that can be used to profile a block of +code. Similar to the decorator, the filename is optional. + +```python +import vllm.utils + +def another_function(): + # more expensive code + pass + +with vllm.utils.cprofile_context("another_function.prof"): + another_function() +``` + +### Analyzing Profile Results + +There are multiple tools available that can help analyze the profile results. +One example is [snakeviz](https://jiffyclub.github.io/snakeviz/). 
+ +```bash +pip install snakeviz +snakeviz expensive_function.prof +``` diff --git a/vllm/utils.py b/vllm/utils.py index 55ee044b482cd..64d9faeb1cb3d 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -2405,3 +2405,51 @@ def swap_dict_values(obj: dict[_K, _V], key1: _K, key2: _K) -> None: obj[key1] = v2 else: obj.pop(key1, None) + + +@contextlib.contextmanager +def cprofile_context(save_file: Optional[str] = None): + """Run a cprofile + + Args: + save_file: path to save the profile result. "1" or + None will result in printing to stdout. + """ + import cProfile + + prof = cProfile.Profile() + prof.enable() + + try: + yield + finally: + prof.disable() + if save_file and save_file != "1": + prof.dump_stats(save_file) + else: + prof.print_stats(sort="cumtime") + + +def cprofile(save_file: Optional[str] = None, enabled: bool = True): + """Decorator to profile a Python method using cProfile. + + Args: + save_file: Path to save the profile result. + If "1", None, or "", results will be printed to stdout. + enabled: Set to false to turn this into a no-op + """ + + def decorator(func: Callable): + + @wraps(func) + def wrapper(*args, **kwargs): + if not enabled: + # If profiling is disabled, just call the function directly. 
+ return func(*args, **kwargs) + + with cprofile_context(save_file): + return func(*args, **kwargs) + + return wrapper + + return decorator From 93a00d7ddec29371efb4764d4c55065eca4c7746 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Fri, 21 Mar 2025 19:56:27 +0800 Subject: [PATCH 26/40] [v1] Refactor KVCacheConfig (#14079) Signed-off-by: Chen Zhang --- tests/v1/core/test_kv_cache_utils.py | 110 ++++++++++++++++++++++- vllm/v1/core/kv_cache_utils.py | 130 +++++++++++++++++++-------- vllm/v1/engine/core.py | 31 +++++-- vllm/v1/executor/abstract.py | 13 ++- vllm/v1/kv_cache_interface.py | 52 +++++++---- vllm/v1/worker/gpu_model_runner.py | 46 ++++++---- vllm/v1/worker/gpu_worker.py | 2 +- vllm/v1/worker/tpu_model_runner.py | 40 +++++---- vllm/v1/worker/tpu_worker.py | 2 +- vllm/v1/worker/worker_base.py | 2 +- 10 files changed, 318 insertions(+), 110 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index ba08b83ec54e5..3fecb517c4369 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +import torch from vllm.multimodal.inputs import MultiModalKwargs from vllm.sampling_params import SamplingParams @@ -8,7 +9,10 @@ from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics, generate_block_hash_extra_keys, hash_block_tokens, - hash_request_tokens) + hash_request_tokens, + unify_kv_cache_configs) +from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, + KVCacheGroupSpec, KVCacheTensor) from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request @@ -314,3 +318,107 @@ def test_metrics(): assert metrics.aggregated_query_total == 0 assert metrics.aggregated_query_hit == 0 assert not metrics.query_queue + + +def test_unify_kv_cache_configs(): + + def new_kv_cache_spec(block_size=16, + num_kv_heads=2, + head_size=64, + 
dtype=torch.float32, + use_mla=False): + return FullAttentionSpec(block_size=block_size, + num_kv_heads=num_kv_heads, + head_size=head_size, + dtype=dtype, + use_mla=use_mla) + + same_kv_cache_config = [ + KVCacheConfig( + num_blocks=10, + tensors={ + "layer1": KVCacheTensor(100), + "layer2": KVCacheTensor(100), + }, + kv_cache_groups=[ + KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), + KVCacheGroupSpec(["layer2"], + new_kv_cache_spec(num_kv_heads=4)), + ], + ), + KVCacheConfig( + num_blocks=20, + tensors={ + "layer1": KVCacheTensor(100), + "layer2": KVCacheTensor(100), + }, + kv_cache_groups=[ + KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), + KVCacheGroupSpec(["layer2"], + new_kv_cache_spec(num_kv_heads=4)), + ], + ), + ] + unify_kv_cache_configs(same_kv_cache_config) + assert same_kv_cache_config[0].num_blocks == 10 + assert same_kv_cache_config[1].num_blocks == 10 + + need_sort_kv_cache_config = [ + KVCacheConfig( + num_blocks=10, + tensors={ + "layer1": KVCacheTensor(100), + "layer2": KVCacheTensor(100), + }, + kv_cache_groups=[ + KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), + KVCacheGroupSpec(["layer2"], + new_kv_cache_spec(num_kv_heads=4)), + ], + ), + KVCacheConfig( + num_blocks=20, + tensors={ + "layer1": KVCacheTensor(100), + "layer2": KVCacheTensor(100), + }, + kv_cache_groups=[ + KVCacheGroupSpec(["layer2"], + new_kv_cache_spec(num_kv_heads=4)), + KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), + ], + ), + ] + + unify_kv_cache_configs(need_sort_kv_cache_config) + assert need_sort_kv_cache_config[0].num_blocks == 10 + assert need_sort_kv_cache_config[1].num_blocks == 10 + + diff_kv_cache_config = [ + KVCacheConfig( + num_blocks=10, + tensors={ + "layer1": KVCacheTensor(100), + "layer2": KVCacheTensor(100), + }, + kv_cache_groups=[ + KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), + KVCacheGroupSpec(["layer2"], + new_kv_cache_spec(num_kv_heads=4)), + ], + ), + KVCacheConfig( + num_blocks=20, + tensors={ + "layer1": KVCacheTensor(100), 
+ "layer2": KVCacheTensor(100), + }, + kv_cache_groups=[ + KVCacheGroupSpec(["layer1"], new_kv_cache_spec()), + KVCacheGroupSpec(["layer2"], + new_kv_cache_spec(num_kv_heads=8)), + ], + ), + ] + with pytest.raises(AssertionError): + unify_kv_cache_configs(diff_kv_cache_config) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index adadcab5ea104..e0d7f4dbdc1c1 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -7,8 +7,8 @@ from typing import Any, NamedTuple, Optional from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.v1.kv_cache_interface import (KVCacheConfig, KVCacheSpec, - KVCacheTensor) +from vllm.v1.kv_cache_interface import (KVCacheConfig, KVCacheGroupSpec, + KVCacheSpec, KVCacheTensor) from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request @@ -449,7 +449,7 @@ def hash_request_tokens(block_size: int, def check_enough_kv_cache_memory(vllm_config: VllmConfig, - kv_cache_spec: KVCacheSpec, + kv_cache_spec: dict[str, KVCacheSpec], available_memory: int): """ Checks whether `available_memory` is enough for the KV cache to hold at @@ -457,7 +457,7 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig, Args: vllm_config: The global VllmConfig - kv_cache_spec: The kv cache spec of the model + kv_cache_spec: The kv cache spec of each attention layer in the model available_memory: Memory available for KV cache in bytes. Raises: @@ -484,12 +484,43 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig, f"`max_model_len` when initializing the engine.") -def is_kv_cache_type_uniform(kv_cache_spec: KVCacheSpec) -> bool: +def create_kv_cache_group_specs( + kv_cache_spec: dict[str, KVCacheSpec], + grouped_layer_names: list[list[str]]) -> list[KVCacheGroupSpec]: + """ + Create KVCacheGroupSpec object for each kv cache group layer. + The layers in the same group should share the same + KVCacheSpec. 
+ + Args: + kv_cache_spec: + A mapping from each layer name to its corresponding KVCacheSpec. + grouped_layer_names: + A list of kv cache groups, where each element is a list of layer + names that belong to the same group and should share the same + KVCacheSpec. + Returns: + A list of KVCacheGroupSpec objects, one for each group. + """ + kv_cache_groups = [] + for layer_names_one_group in grouped_layer_names: + layer_spec = kv_cache_spec[layer_names_one_group[0]] + assert all( + kv_cache_spec[layer_name] == layer_spec + for layer_name in layer_names_one_group[1:]), ( + "All layers in the same KV cache group must share the same " + "KVCacheSpec.") + kv_cache_groups.append( + KVCacheGroupSpec(layer_names_one_group, layer_spec)) + return kv_cache_groups + + +def is_kv_cache_type_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool: """ Whether all layers in the given KVCacheSpec have the same type of KV cache. Args: - kv_cache_spec: The KVCacheSpec of the model + kv_cache_spec: The kv cache spec of each attention layer in the model Returns: True if all layers have the same type, False otherwise. @@ -500,18 +531,16 @@ def is_kv_cache_type_uniform(kv_cache_spec: KVCacheSpec) -> bool: def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, - kv_cache_spec: KVCacheSpec, - available_memory: int, - num_layers: int) -> KVCacheConfig: + kv_cache_spec: dict[str, KVCacheSpec], + available_memory: int) -> KVCacheConfig: """ Generates the KV cache configuration for a model with one type of KV cache. Divide the available memory equally among all layers. Args: vllm_config: The global VllmConfig - kv_cache_spec: The kv cache spec of the model + kv_cache_spec: The kv cache spec of each attention layer in the model available_memory: Memory available for KV cache in bytes. - num_layers: The number of layers in the model. 
Returns: The generated KVCacheConfig @@ -521,7 +550,7 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, assert len(page_sizes) == 1 page_size = page_sizes.pop() - num_blocks = int(available_memory // page_size // num_layers) + num_blocks = int(available_memory // page_size // len(kv_cache_spec)) num_blocks = max(num_blocks, 0) if vllm_config.cache_config.num_gpu_blocks_override is not None: @@ -541,6 +570,9 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, max_model_len_str, max_concurrency) per_layer_size = page_size * num_blocks + # All layers have the same KV cache spec, so we create one kv cache group + # for all layers. + grouped_layer_names = [list(kv_cache_spec.keys())] kv_cache_config = KVCacheConfig( num_blocks=num_blocks, @@ -548,41 +580,69 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, layer_name: KVCacheTensor(size=per_layer_size) for layer_name in kv_cache_spec }, - groups=[[layer_name for layer_name in kv_cache_spec]], - kv_cache_spec=kv_cache_spec) + kv_cache_groups=create_kv_cache_group_specs(kv_cache_spec, + grouped_layer_names), + ) return kv_cache_config -def get_kv_cache_configs(vllm_config: VllmConfig, - kv_cache_specs: list[KVCacheSpec], - available_memory: int) -> list[KVCacheConfig]: +def get_kv_cache_config(vllm_config: VllmConfig, + kv_cache_spec: dict[str, KVCacheSpec], + available_memory: int) -> KVCacheConfig: """ Generates the KV cache configuration for a model TODO: support hybrid models with more than one type of KV cache. Args: vllm_config: The global VllmConfig - kv_cache_specs: The kv cache specs of the model + kv_cache_spec: The kv cache spec of each attention layer in the model available_memory: Memory available for KV cache in bytes. Returns: The generated KVCacheConfigs """ - # Use the max number of layers to conservatively determine - # the number of blocks. 
- num_layers = max(len(kv_cache_spec) for kv_cache_spec in kv_cache_specs) - kv_cache_configs = [] - for kv_cache_spec in kv_cache_specs: - check_enough_kv_cache_memory(vllm_config, kv_cache_spec, - available_memory) - if is_kv_cache_type_uniform(kv_cache_spec): - # KV cache of all layers are the same, which is true for - # most models. Allocate the same amount of memory for - # each layer. - kv_cache_configs.append( - _get_kv_cache_config_uniform_type(vllm_config, kv_cache_spec, - available_memory, - num_layers)) - else: - raise NotImplementedError + check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory) + if is_kv_cache_type_uniform(kv_cache_spec): + # KV cache of all layers are the same, which is true for + # most models. Allocate the same amount of memory for + # each layer. + return _get_kv_cache_config_uniform_type(vllm_config, kv_cache_spec, + available_memory) + + raise NotImplementedError + + +def unify_kv_cache_configs(kv_cache_configs: list[KVCacheConfig]): + """ + Make the KV cache configurations for each worker consistent, so that all + workers can be controlled by the same KVCacheManager. + This function verifies that the layer group of each worker are the same, + and changes the num_blocks of each worker to the smallest among all workers. + + Args: + kv_cache_configs: The KV cache configurations for each worker. Will be + in-place modified to make them consistent. + """ + + # Sort the kv cache groups by the type_id of their KV cache spec. + # This can avoid the inconsistency caused by the order of groups. + for kv_cache_config in kv_cache_configs: + kv_cache_config.kv_cache_groups.sort( + key=lambda x: x.kv_cache_spec.type_id) + + # Verify that the groups of each rank are the same. 
+ for kv_cache_config in kv_cache_configs[1:]: + for group_rank_0, group_rank_i in zip( + kv_cache_configs[0].kv_cache_groups, + kv_cache_config.kv_cache_groups): + assert group_rank_0.kv_cache_spec == group_rank_i.kv_cache_spec + + # Change the num_blocks of each rank to the smallest among all ranks. We + # do not need to shrink the tensor size because it is valid to only use the + # first `num_blocks` blocks of the tensor. + min_num_blocks = min(kv_cache_config.num_blocks + for kv_cache_config in kv_cache_configs) + for kv_cache_config in kv_cache_configs: + kv_cache_config.num_blocks = min_num_blocks + return kv_cache_configs diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 1598e6b8443fe..f4bb4583bea48 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -21,7 +21,8 @@ from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.utils import (get_exception_traceback, resolve_obj_by_qualname, zmq_socket_ctx) -from vllm.v1.core.kv_cache_utils import get_kv_cache_configs +from vllm.v1.core.kv_cache_utils import (get_kv_cache_config, + unify_kv_cache_configs) from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, @@ -120,15 +121,27 @@ class EngineCore: # memory can be allocated for kv cache. 
available_gpu_memory = self.model_executor.determine_available_memory() + assert len(kv_cache_specs) == len(available_gpu_memory) # Get the kv cache tensor size - kv_cache_configs = get_kv_cache_configs(vllm_config, kv_cache_specs, - available_gpu_memory) - num_gpu_blocks_set = set(config.num_blocks - for config in kv_cache_configs) - assert len(num_gpu_blocks_set) == 1, ( - f"num_gpu_blocks need to be the same across workers, " - f"but they are different: {num_gpu_blocks_set}") - num_gpu_blocks = num_gpu_blocks_set.pop() + kv_cache_configs = [ + get_kv_cache_config(vllm_config, kv_cache_spec_one_worker, + available_gpu_memory_one_worker) + for kv_cache_spec_one_worker, available_gpu_memory_one_worker in + zip(kv_cache_specs, available_gpu_memory) + ] + + # Since we use a shared centralized controller, we need the + # `kv_cache_config` to be consistent across all workers to make sure + # all the memory operators can be applied to all workers. + unify_kv_cache_configs(kv_cache_configs) + + # All workers have the same kv_cache_config except layer names, so use + # an arbitrary one to get the number of blocks. 
+ assert all([ + cfg.num_blocks == kv_cache_configs[0].num_blocks + for cfg in kv_cache_configs + ]) + num_gpu_blocks = kv_cache_configs[0].num_blocks num_cpu_blocks = 0 # Initialize kv cache and warmup the execution diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index aa6ae83c26ea7..e3a4cd98c1f81 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -62,14 +62,11 @@ class Executor(ExecutorBase): args=(kv_cache_configs, )) self.collective_rpc("compile_or_warm_up_model") - def determine_available_memory(self) -> int: # in bytes + def determine_available_memory(self) -> list[int]: # in bytes output = self.collective_rpc("determine_available_memory") - # Since we use a shared centralized controller, we take the minimum - # memory size across all workers to make sure all the memory - # operators can be applied to all workers. - return min(output) + return output - def get_kv_cache_specs(self) -> list[KVCacheSpec]: + def get_kv_cache_specs(self) -> list[dict[str, KVCacheSpec]]: output = self.collective_rpc("get_kv_cache_spec") return output @@ -95,7 +92,7 @@ class UniProcExecutor(UniProcExecutorV0, Executor): class ExecutorWithExternalLauncher(ExecutorWithExternalLauncherV0, Executor): - def determine_available_memory(self) -> int: # in bytes + def determine_available_memory(self) -> list[int]: # in bytes # same as determine_num_available_blocks in v0, # we need to get the min across all ranks. 
memory = super().determine_available_memory() @@ -103,4 +100,4 @@ class ExecutorWithExternalLauncher(ExecutorWithExternalLauncherV0, Executor): cpu_group = get_world_group().cpu_group memory_tensor = torch.tensor([memory], device="cpu", dtype=torch.int64) dist.all_reduce(memory_tensor, group=cpu_group, op=dist.ReduceOp.MIN) - return memory_tensor.item() + return [memory_tensor.item()] diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 1f885c10c8c3a..867b1b61c8799 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -11,7 +11,7 @@ logger = init_logger(__name__) @dataclass -class KVCacheSpecBase: +class KVCacheSpec: """ A base class for specifying the KV cache format of one layer. """ @@ -55,7 +55,7 @@ class KVCacheSpecBase: @dataclass -class FullAttentionSpec(KVCacheSpecBase): +class FullAttentionSpec(KVCacheSpec): num_kv_heads: int head_size: int dtype: torch.dtype @@ -76,9 +76,6 @@ class FullAttentionSpec(KVCacheSpecBase): return cdiv(num_tokens, self.block_size) * self.page_size_bytes -KVCacheSpec = dict[str, KVCacheSpecBase] - - @dataclass class KVCacheTensor: """ @@ -89,6 +86,18 @@ class KVCacheTensor: size: int # The size of KV cache Tensor in bytes +@dataclass +class KVCacheGroupSpec: + """ + Represents a group of model layers that share the same KV cache block table. + These layers are regarded as one layer in the KV cache manager. + """ + # The names of model layers in this group + layer_names: list[str] + # The KV cache spec of this manager layer + kv_cache_spec: KVCacheSpec + + @dataclass class KVCacheConfig: """ @@ -99,17 +108,24 @@ class KVCacheConfig: """layer_name -> how to initialize KV cache for that layer""" tensors: dict[str, KVCacheTensor] """ - A list of kv-cache groups. Each group includes a set of layers with - the same kv-cache spec, and the total page_size of layers inside a group - is same across all groups (as the KVCacheManager only supports allocating - pages of the same size). 
For example: - 1. A model only uses full attention: one group with all layers in the model. - 2. (not implemented yet) A model with the same number of full attention - layers and sliding window attention layers: two groups, one for full - attention layers and one for sliding window attention layers. - 3. (not implemented yet) A model with 2 full attention layers and 4 sliding - window attention layers: three groups, (full * 2), (sw * 2), (sw * 2). + The kv cache groups of the model. + The layers in the models are repeated with some patterns, e.g., a model + with 10 full attention layers and 20 sliding window attention layers can be + regarded as repeating the pattern (1 * full, 2 * sw) 10 times. + The KVCacheManager allocates different block tables for each of the 3 layers + in the pattern, and repeats each of them 10 times to generate the + block_table for the 30 layers in the model. + Therefore, we can group the layers in the model into 3 groups, each of which + contains 10 layers in the model. + The KVCacheManager allocates the block_table for each group based on its + kv_cache spec, and the model runner applies the block table to each layer + in the group. + For example: + 1. A model only uses full attention. The pattern is + (num_hidden_layers * full), so there is only one group and the block table + is shared by all layers. + 2. (WIP) A model with 10 full attention layers and 20 sliding window + attention layers. There are 3 layers in the pattern (1 * full, 2 * sw), so + there are 3 groups, each of which represents 10 layers in the model. 
""" - groups: list[list[str]] - """the KVCacheSpec of the model""" - kv_cache_spec: KVCacheSpec + kv_cache_groups: list[KVCacheGroupSpec] diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b186300a00330..229849e4439b1 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1510,34 +1510,46 @@ class GPUModelRunner(LoRAModelRunnerMixin): kv_cache_config: Configuration for the KV cache, including the KV cache size of each layer """ - if len(kv_cache_config.groups) > 1: + if len(kv_cache_config.kv_cache_groups) > 1: raise NotImplementedError( "Hybrid models with more than one KV cache type are not " "supported yet.") kv_caches: dict[str, torch.Tensor] = {} - for layer_name, layer_spec in kv_cache_config.kv_cache_spec.items(): - tensor_config = kv_cache_config.tensors[layer_name] - assert tensor_config.size % layer_spec.page_size_bytes == 0 - num_blocks = tensor_config.size // layer_spec.page_size_bytes - if isinstance(layer_spec, FullAttentionSpec): - kv_cache_shape = self.attn_backend.get_kv_cache_shape( - num_blocks, layer_spec.block_size, layer_spec.num_kv_heads, - layer_spec.head_size) - dtype = layer_spec.dtype - kv_caches[layer_name] = torch.zeros(kv_cache_shape, - dtype=dtype, - device=self.device) - else: - raise NotImplementedError + for kv_cache_group in kv_cache_config.kv_cache_groups: + kv_cache_spec = kv_cache_group.kv_cache_spec + for layer_name in kv_cache_group.layer_names: + tensor_config = kv_cache_config.tensors[layer_name] + assert tensor_config.size % kv_cache_spec.page_size_bytes == 0 + num_blocks = tensor_config.size // kv_cache_spec.page_size_bytes + # `num_blocks` is the number of blocks the model runner can use. + # `kv_cache_config.num_blocks` is the number of blocks that + # KVCacheManager may allocate. 
+ # Since different GPUs may have different number of layers and + # different memory capacities, `num_blocks` can be different on + # different GPUs, and `kv_cache_config.num_blocks` is set to + # the min of all `num_blocks`. Verify it here. + assert num_blocks >= kv_cache_config.num_blocks + if isinstance(kv_cache_spec, FullAttentionSpec): + kv_cache_shape = self.attn_backend.get_kv_cache_shape( + num_blocks, kv_cache_spec.block_size, + kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) + dtype = kv_cache_spec.dtype + kv_caches[layer_name] = torch.zeros(kv_cache_shape, + dtype=dtype, + device=self.device) + else: + # TODO: add new branches when introducing more types of + # KV cache specs. + raise ValueError("Unknown KV cache spec type.") bind_kv_cache( kv_caches, self.vllm_config.compilation_config.static_forward_context, self.kv_caches) - def get_kv_cache_spec(self) -> KVCacheSpec: + def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: """ Generates the KVCacheSpec by parsing the kv cache format from each Attention module in the static forward context. 
@@ -1549,7 +1561,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): forward_ctx = self.vllm_config.compilation_config.static_forward_context block_size = self.vllm_config.cache_config.block_size use_mla = self.vllm_config.model_config.use_mla - kv_cache_spec: KVCacheSpec = {} + kv_cache_spec: dict[str, KVCacheSpec] = {} for layer_name, attn_module in forward_ctx.items(): if isinstance(attn_module, FusedMoE): continue diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index a63a2d022378e..51b9f56739665 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -185,7 +185,7 @@ class Worker(WorkerBase): return int(available_kv_cache_memory) - def get_kv_cache_spec(self) -> KVCacheSpec: + def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: return self.model_runner.get_kv_cache_spec() def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None: diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index d772a3ee13ec3..f4502f6b42379 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -309,7 +309,7 @@ class TPUModelRunner: assert self.model is not None return self.model - def get_kv_cache_spec(self) -> KVCacheSpec: + def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: """ Generates the KVCacheSpec by parsing the kv cache format from each Attention module in the static forward context. @@ -320,7 +320,7 @@ class TPUModelRunner: forward_ctx = self.vllm_config.compilation_config.static_forward_context block_size = self.vllm_config.cache_config.block_size - kv_cache_spec: KVCacheSpec = {} + kv_cache_spec: dict[str, KVCacheSpec] = {} for layer_name, attn_module in forward_ctx.items(): # TODO: Support other attention modules, e.g., sliding window, # cross-attention, MLA. 
@@ -837,31 +837,33 @@ class TPUModelRunner: kv_cache_config: Configuration for the KV cache, including the KV cache size of each layer """ - if len(kv_cache_config.groups) > 1: + if len(kv_cache_config.kv_cache_groups) > 1: raise NotImplementedError( "Hybrid models with more than one KV cache type are not " "supported yet.") kv_caches: dict[str, torch.Tensor] = {} - for layer_name, layer_spec in kv_cache_config.kv_cache_spec.items(): - tensor_config = kv_cache_config.tensors[layer_name] - assert tensor_config.size % layer_spec.page_size_bytes == 0 - num_blocks = tensor_config.size // layer_spec.page_size_bytes - if isinstance(layer_spec, FullAttentionSpec): - kv_cache_shape = PallasAttentionBackend.get_kv_cache_shape( - num_blocks, layer_spec.block_size, layer_spec.num_kv_heads, - layer_spec.head_size) - dtype = layer_spec.dtype + for kv_cache_group in kv_cache_config.kv_cache_groups: + kv_cache_spec = kv_cache_group.kv_cache_spec + for layer_name in kv_cache_group.layer_names: + tensor_config = kv_cache_config.tensors[layer_name] + assert tensor_config.size % kv_cache_spec.page_size_bytes == 0 + num_blocks = tensor_config.size // kv_cache_spec.page_size_bytes + if isinstance(kv_cache_spec, FullAttentionSpec): + kv_cache_shape = PallasAttentionBackend.get_kv_cache_shape( + num_blocks, kv_cache_spec.block_size, + kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) + dtype = kv_cache_spec.dtype - tpu_k_cache = torch.zeros(kv_cache_shape, - dtype=dtype, - device=self.device) - tpu_v_cache = torch.zeros_like(tpu_k_cache) + tpu_k_cache = torch.zeros(kv_cache_shape, + dtype=dtype, + device=self.device) + tpu_v_cache = torch.zeros_like(tpu_k_cache) - kv_caches[layer_name] = (tpu_k_cache, tpu_v_cache) - else: - raise NotImplementedError + kv_caches[layer_name] = (tpu_k_cache, tpu_v_cache) + else: + raise NotImplementedError bind_kv_cache( kv_caches, diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index dbb231950d08d..d56c25dd9da24 100644 --- 
a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -189,7 +189,7 @@ class TPUWorker: def get_model(self) -> nn.Module: return self.model_runner.get_model() - def get_kv_cache_spec(self) -> KVCacheSpec: + def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: return self.model_runner.get_kv_cache_spec() def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None: diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py index 51d2da2344b81..487a49b6211e2 100644 --- a/vllm/v1/worker/worker_base.py +++ b/vllm/v1/worker/worker_base.py @@ -51,7 +51,7 @@ class WorkerBase(WorkerBaseV0): self.device: Optional[torch.device] = None self.model_runner: Optional[nn.Module] = None - def get_kv_cache_spec(self) -> KVCacheSpec: + def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: """Get specifications for KV cache implementation.""" raise NotImplementedError From c21b99b91241409c2fdf9f3f8c542e8748b317be Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Fri, 21 Mar 2025 20:14:36 +0800 Subject: [PATCH 27/40] [Bugfix][VLM] fix llava processor (#15285) Signed-off-by: Mengqing Cao --- vllm/model_executor/models/llava.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 441ccde046eb9..40accfffe4f9d 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -233,7 +233,13 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]): class LlavaProcessingInfo(BaseLlavaProcessingInfo): def get_hf_processor(self, **kwargs: object): - return self.ctx.get_hf_processor(LlavaProcessor, **kwargs) + hf_processor = self.ctx.get_hf_processor(LlavaProcessor, **kwargs) + # In case patch_size is omitted from `processor_config.json` + # e.g. 
for E5-V: https://huggingface.co/royokong/e5-v + if hf_processor.patch_size is None: + patch_size = self.get_vision_encoder_info().get_patch_size() + hf_processor.patch_size = patch_size + return hf_processor class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor[_I]): From baec0d4de9d202a4d91b8df0c3f1c53a4ddb0a09 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 21 Mar 2025 23:30:23 +0800 Subject: [PATCH 28/40] Revert "[Feature] specify model in config.yaml (#14855)" (#15293) Signed-off-by: DarkLight1337 --- .../serving/openai_compatible_server.md | 4 +- tests/config/test_config_with_model.yaml | 7 --- tests/conftest.py | 12 ----- tests/{config => data}/test_config.yaml | 0 tests/test_utils.py | 53 ++++--------------- vllm/entrypoints/cli/serve.py | 22 ++++---- vllm/utils.py | 34 ++++-------- 7 files changed, 30 insertions(+), 102 deletions(-) delete mode 100644 tests/config/test_config_with_model.yaml rename tests/{config => data}/test_config.yaml (100%) diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 378405d3690f2..a6ec05f45b69b 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -184,7 +184,6 @@ For example: ```yaml # config.yaml -model: meta-llama/Llama-3.1-8B-Instruct host: "127.0.0.1" port: 6379 uvicorn-log-level: "info" @@ -193,13 +192,12 @@ uvicorn-log-level: "info" To use the above config file: ```bash -vllm serve --config config.yaml +vllm serve SOME_MODEL --config config.yaml ``` :::{note} In case an argument is supplied simultaneously using command line and the config file, the value from the command line will take precedence. The order of priorities is `command line > config file values > defaults`. -e.g. `vllm serve SOME_MODEL --config config.yaml`, SOME_MODEL takes precedence over `model` in config file. 
::: ## API Reference diff --git a/tests/config/test_config_with_model.yaml b/tests/config/test_config_with_model.yaml deleted file mode 100644 index d8c8c7bc8162a..0000000000000 --- a/tests/config/test_config_with_model.yaml +++ /dev/null @@ -1,7 +0,0 @@ -# Same as test_config.yaml but with model specified -model: config-model -port: 12312 -served_model_name: mymodel -tensor_parallel_size: 2 -trust_remote_code: true -multi_step_stream_outputs: false diff --git a/tests/conftest.py b/tests/conftest.py index 8c6046c5817a1..0c71d98101647 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1121,15 +1121,3 @@ def pytest_collection_modifyitems(config, items): for item in items: if "optional" in item.keywords: item.add_marker(skip_optional) - - -@pytest.fixture(scope="session") -def cli_config_file(): - """Return the path to the CLI config file.""" - return os.path.join(_TEST_DIR, "config", "test_config.yaml") - - -@pytest.fixture(scope="session") -def cli_config_file_with_model(): - """Return the path to the CLI config file with model.""" - return os.path.join(_TEST_DIR, "config", "test_config_with_model.yaml") diff --git a/tests/config/test_config.yaml b/tests/data/test_config.yaml similarity index 100% rename from tests/config/test_config.yaml rename to tests/data/test_config.yaml diff --git a/tests/test_utils.py b/tests/test_utils.py index 1c33f83c59366..3660cfa0e49e2 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -8,7 +8,7 @@ from unittest.mock import patch import pytest import torch -from vllm_test_utils.monitor import monitor +from vllm_test_utils import monitor from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config from vllm.utils import (FlexibleArgumentParser, MemorySnapshot, @@ -140,8 +140,7 @@ def parser(): def parser_with_config(): parser = FlexibleArgumentParser() parser.add_argument('serve') - parser.add_argument('model_tag', nargs='?') - parser.add_argument('--model', type=str) + parser.add_argument('model_tag') 
parser.add_argument('--served-model-name', type=str) parser.add_argument('--config', type=str) parser.add_argument('--port', type=int) @@ -197,29 +196,29 @@ def test_missing_required_argument(parser): parser.parse_args([]) -def test_cli_override_to_config(parser_with_config, cli_config_file): +def test_cli_override_to_config(parser_with_config): args = parser_with_config.parse_args([ - 'serve', 'mymodel', '--config', cli_config_file, + 'serve', 'mymodel', '--config', './data/test_config.yaml', '--tensor-parallel-size', '3' ]) assert args.tensor_parallel_size == 3 args = parser_with_config.parse_args([ 'serve', 'mymodel', '--tensor-parallel-size', '3', '--config', - cli_config_file + './data/test_config.yaml' ]) assert args.tensor_parallel_size == 3 assert args.port == 12312 args = parser_with_config.parse_args([ 'serve', 'mymodel', '--tensor-parallel-size', '3', '--config', - cli_config_file, '--port', '666' + './data/test_config.yaml', '--port', '666' ]) assert args.tensor_parallel_size == 3 assert args.port == 666 -def test_config_args(parser_with_config, cli_config_file): +def test_config_args(parser_with_config): args = parser_with_config.parse_args( - ['serve', 'mymodel', '--config', cli_config_file]) + ['serve', 'mymodel', '--config', './data/test_config.yaml']) assert args.tensor_parallel_size == 2 assert args.trust_remote_code assert not args.multi_step_stream_outputs @@ -241,9 +240,10 @@ def test_config_file(parser_with_config): ]) -def test_no_model_tag(parser_with_config, cli_config_file): +def test_no_model_tag(parser_with_config): with pytest.raises(ValueError): - parser_with_config.parse_args(['serve', '--config', cli_config_file]) + parser_with_config.parse_args( + ['serve', '--config', './data/test_config.yaml']) # yapf: enable @@ -476,34 +476,3 @@ def test_swap_dict_values(obj, key1, key2): assert obj[key1] == original_obj[key2] else: assert key1 not in obj - - -def test_model_specification(parser_with_config, - cli_config_file, - 
cli_config_file_with_model): - # Test model in CLI takes precedence over config - args = parser_with_config.parse_args([ - 'serve', 'cli-model', '--config', cli_config_file_with_model - ]) - assert args.model_tag == 'cli-model' - assert args.served_model_name == 'mymodel' - - # Test model from config file works - args = parser_with_config.parse_args([ - 'serve', '--config', cli_config_file_with_model - ]) - assert args.model == 'config-model' - assert args.served_model_name == 'mymodel' - - # Test no model specified anywhere raises error - with pytest.raises(ValueError, match="No model specified!"): - parser_with_config.parse_args(['serve', '--config', cli_config_file]) - - # Test other config values are preserved - args = parser_with_config.parse_args([ - 'serve', 'cli-model', '--config', cli_config_file_with_model - ]) - assert args.tensor_parallel_size == 2 - assert args.trust_remote_code is True - assert args.multi_step_stream_outputs is False - assert args.port == 12312 diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index cf05eb09b37ad..c345ece4dada9 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -21,16 +21,14 @@ class ServeSubcommand(CLISubcommand): @staticmethod def cmd(args: argparse.Namespace) -> None: - # If model is specified in CLI (as positional arg), it takes precedence - if hasattr(args, 'model_tag') and args.model_tag is not None: - args.model = args.model_tag - # Otherwise use model from config (already in args.model) - - # Check if we have a model specified somewhere - if args.model == EngineArgs.model: # Still has default value + # The default value of `--model` + if args.model != EngineArgs.model: raise ValueError( - "With `vllm serve`, you should provide the model either as a " - "positional argument or in config file.") + "With `vllm serve`, you should provide the model as a " + "positional argument instead of via the `--model` option.") + + # EngineArgs expects the model name to 
be passed as --model. + args.model = args.model_tag uvloop.run(run_server(args)) @@ -43,12 +41,10 @@ class ServeSubcommand(CLISubcommand): serve_parser = subparsers.add_parser( "serve", help="Start the vLLM OpenAI Compatible API server", - usage="vllm serve [model_tag] [options]") + usage="vllm serve [options]") serve_parser.add_argument("model_tag", type=str, - nargs='?', - help="The model tag to serve " - "(optional if specified in config)") + help="The model tag to serve") serve_parser.add_argument( "--config", type=str, diff --git a/vllm/utils.py b/vllm/utils.py index 64d9faeb1cb3d..9e09f0b9f2d94 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1265,29 +1265,19 @@ class FlexibleArgumentParser(argparse.ArgumentParser): config_args = self._load_config_file(file_path) # 0th index is for {serve,chat,complete} - # optionally followed by model_tag (only for serve) + # followed by model_tag (only for serve) # followed by config args # followed by rest of cli args. # maintaining this order will enforce the precedence # of cli > config > defaults if args[0] == "serve": - model_in_cli = len(args) > 1 and not args[1].startswith('-') - model_in_config = any(arg == '--model' for arg in config_args) - - if not model_in_cli and not model_in_config: + if index == 1: raise ValueError( - "No model specified! Please specify model either in " - "command-line arguments or in config file.") - - if model_in_cli: - # Model specified as positional arg, keep CLI version - args = [args[0]] + [ - args[1] - ] + config_args + args[2:index] + args[index + 2:] - else: - # No model in CLI, use config if available - args = [args[0] - ] + config_args + args[1:index] + args[index + 2:] + "No model_tag specified! 
Please check your command-line" + " arguments.") + args = [args[0]] + [ + args[1] + ] + config_args + args[2:index] + args[index + 2:] else: args = [args[0]] + config_args + args[1:index] + args[index + 2:] @@ -1305,7 +1295,9 @@ class FlexibleArgumentParser(argparse.ArgumentParser): '--port': '12323', '--tensor-parallel-size': '4' ] + """ + extension: str = file_path.split('.')[-1] if extension not in ('yaml', 'yml'): raise ValueError( @@ -1330,15 +1322,7 @@ class FlexibleArgumentParser(argparse.ArgumentParser): if isinstance(action, StoreBoolean) ] - # Skip model from config if it's provided as positional argument - skip_model = (hasattr(self, '_parsed_args') and self._parsed_args - and len(self._parsed_args) > 1 - and self._parsed_args[0] == 'serve' - and not self._parsed_args[1].startswith('-')) - for key, value in config.items(): - if skip_model and key == 'model': - continue if isinstance(value, bool) and key not in store_boolean_arguments: if value: processed_args.append('--' + key) From cfbb8c930fcda6d97f0de3018bd3c51ab14b367c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Fri, 21 Mar 2025 16:50:39 +0100 Subject: [PATCH 29/40] [TPU][V1] MHA Pallas backend (#15288) Signed-off-by: NickLucche --- tests/v1/tpu/test_mha_attn.py | 109 ++++++++++++++++++++++++++++++++++ vllm/attention/layer.py | 10 +++- 2 files changed, 117 insertions(+), 2 deletions(-) create mode 100644 tests/v1/tpu/test_mha_attn.py diff --git a/tests/v1/tpu/test_mha_attn.py b/tests/v1/tpu/test_mha_attn.py new file mode 100644 index 0000000000000..01664598ccfde --- /dev/null +++ b/tests/v1/tpu/test_mha_attn.py @@ -0,0 +1,109 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Test: + +* Tests for MultiHeadAttention layer +""" + +import pytest +import torch +import torch_xla +import torch_xla.core +import torch_xla.core.xla_model + +from vllm import envs +from vllm.attention.layer import MultiHeadAttention +from vllm.attention.selector import _cached_get_attn_backend +from 
vllm.platforms import current_platform + +if not envs.VLLM_USE_V1: + pytest.skip( + "Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.", + allow_module_level=True, + ) + + +@pytest.fixture(autouse=True) +def clear_cache(): + """Clear lru cache to ensure each test case runs without caching. + """ + _cached_get_attn_backend.cache_clear() + + +def ref_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + scale: float, +) -> torch.Tensor: + """ + Native implementation of scaled dot product attention without mask: + - query, key, value: [batch_size, seq_len, num_heads, head_size] + - attn_mask: [batch_size, seq_len, seq_len] + """ + query, key, value = (x.transpose(1, 2) for x in (query, key, value)) + attn_weights = scale * torch.matmul(query, key.transpose(2, 3)) + attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype) + out = torch.matmul(attn_weights, value).transpose(1, 2) + return out + + +BATCH_SIZES = [1, 16] +SEQ_LENS = [1] +NUM_HEADS = [1, 16] +NUM_KV_HEADS = [1] +HEAD_SIZES = [64, 80] + + +@pytest.mark.skipif(not current_platform.is_tpu(), + reason="This test needs a TPU") +@pytest.mark.parametrize("batch_size", BATCH_SIZES) +@pytest.mark.parametrize("seq_len", SEQ_LENS) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("num_kv_heads", NUM_KV_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("device", [torch_xla.core.xla_model.xla_device()]) +def test_mha_attn_forward( + batch_size: int, + seq_len: int, + num_heads: int, + num_kv_heads: int, + head_size: int, + device: str, +): + current_platform.seed_everything(0) + # These are expected to be f32 + q = torch.randn(batch_size, seq_len, num_heads * head_size, device=device) + k = torch.randn(batch_size, + seq_len, + num_kv_heads * head_size, + device=device) + v = torch.randn(batch_size, + seq_len, + num_kv_heads * head_size, + device=device) + scale = 1.0 / head_size**0.5 + attn = 
MultiHeadAttention(num_heads, + head_size, + scale=scale, + num_kv_heads=num_kv_heads) + output = attn(q, k, v) + + assert num_heads % num_kv_heads == 0 + num_queries_per_kv = num_heads // num_kv_heads + + q = q.reshape(batch_size, seq_len, num_heads, head_size) + k = k.reshape(batch_size, seq_len, num_kv_heads, head_size) + v = v.reshape(batch_size, seq_len, num_kv_heads, head_size) + if num_queries_per_kv > 1: + k = torch.repeat_interleave(k, num_queries_per_kv, dim=2) + v = torch.repeat_interleave(v, num_queries_per_kv, dim=2) + + ref_output = ref_attention( + q, + k, + v, + scale=scale, + ).reshape(batch_size, seq_len, num_heads * head_size) + # torch_xla flash_attn kernel is less accurate but much faster + torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-3) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 946c07d508a38..dbf4723ee1bd7 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -281,8 +281,7 @@ class MultiHeadAttention(nn.Module): backend = _Backend.XFORMERS self.attn_backend = backend if backend in { - _Backend.TORCH_SDPA, - _Backend.XFORMERS, + _Backend.TORCH_SDPA, _Backend.XFORMERS, _Backend.PALLAS_VLLM_V1 } else _Backend.TORCH_SDPA def forward( @@ -320,6 +319,13 @@ class MultiHeadAttention(nn.Module): value, scale=self.scale) out = out.transpose(1, 2) + elif self.attn_backend == _Backend.PALLAS_VLLM_V1: + query, key, value = (x.transpose(1, 2) + for x in (query, key, value)) + from torch_xla.experimental.custom_kernel import flash_attention + out = flash_attention(query, key, value, sm_scale=self.scale) + out = out.transpose(1, 2) + return out.reshape(bsz, q_len, -1) From 790b79750b596043036b9fcbee885827fdd2ef3d Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 21 Mar 2025 18:28:46 -0400 Subject: [PATCH 30/40] [Build/CI] Fix env var typo (#15305) Signed-off-by: Russell Bryant --- .../nightly-benchmarks/scripts/run-performance-benchmarks.sh | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index a3555f72a6665..4cd449b141ece 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -361,7 +361,7 @@ main() { # get the current IP address, required by benchmark_serving.py export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') # turn of the reporting of the status of each request, to clean up the terminal output - export VLLM_LOG_LEVEL="WARNING" + export VLLM_LOGGING_LEVEL="WARNING" # prepare for benchmarking cd benchmarks || exit 1 From 4c69e228b32220ac9159dfdcf0df13ea776e630d Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Fri, 21 Mar 2025 22:25:43 -0700 Subject: [PATCH 31/40] [Misc] Increase RayDistributedExecutor RAY_CGRAPH_get_timeout (#15301) Signed-off-by: Rui Qiao --- vllm/executor/ray_distributed_executor.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index d769d235020d5..c823ab5bf9698 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -561,6 +561,15 @@ class RayDistributedExecutor(DistributedExecutorBase): envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL) logger.info("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM = %s", envs.VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM) + # Enlarge the default value of "RAY_CGRAPH_get_timeout" to 300 seconds + # (it is 10 seconds by default). This is a Ray environment variable to + # control the timeout of getting result from a compiled graph execution, + # i.e., the distributed execution that includes model forward runs and + # intermediate tensor communications, in the case of vllm. 
+ os.environ.setdefault("RAY_CGRAPH_get_timeout", "300") # noqa: SIM112 + logger.info("RAY_CGRAPH_get_timeout is set to %s", + os.environ["RAY_CGRAPH_get_timeout"]) # noqa: SIM112 + with InputNode() as input_data: # Example DAG: PP=2, TP=4 # From df1430265c0bda5fe02c43005352cce7a8aa9562 Mon Sep 17 00:00:00 2001 From: Andy Lo Date: Sat, 22 Mar 2025 05:35:37 +0000 Subject: [PATCH 32/40] [Bugfix][V0] Multi-sequence logprobs streaming edge case (#15259) Signed-off-by: Andy Lo --- vllm/outputs.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/outputs.py b/vllm/outputs.py index 7a20c340edcf7..014e8d5d88238 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -223,7 +223,12 @@ class RequestOutput: if delta: # Slice logprobs delta if applicable if output_logprobs: - output_logprobs = output_logprobs[-num_output_tokens:] + # num_output_tokens can be 0 when n > 1 and request finishes + # before the others + if num_output_tokens > 0: + output_logprobs = output_logprobs[-num_output_tokens:] + else: + output_logprobs = None # Don't include prompt if this is after the first output # containing decode token ids if include_prompt and seq.get_output_len() > num_output_tokens: From ec870fba9a59b8287fa205e4c35def4d3d153080 Mon Sep 17 00:00:00 2001 From: TJian Date: Sat, 22 Mar 2025 13:36:14 +0800 Subject: [PATCH 33/40] [FEAT] [ROCm]: Add AITER RMS Norm (Layer Norm) Feature (#14959) Signed-off-by: tjtanaa --- Dockerfile.rocm_base | 16 +++- .../model_executor/test_enabled_custom_ops.py | 29 +++++- .../decoder_only/language/test_models.py | 50 ++++++++-- vllm/envs.py | 13 +++ vllm/model_executor/layers/layernorm.py | 94 +++++++++++++++---- 5 files changed, 173 insertions(+), 29 deletions(-) diff --git a/Dockerfile.rocm_base b/Dockerfile.rocm_base index e33e73b303098..38d6a33636eba 100644 --- a/Dockerfile.rocm_base +++ b/Dockerfile.rocm_base @@ -12,6 +12,8 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" ARG 
PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG FA_BRANCH="b7d29fb" ARG FA_REPO="https://github.com/ROCm/flash-attention.git" +ARG AITER_BRANCH="21d47a9" +ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base @@ -129,8 +131,18 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ pip install /install/*.whl +ARG AITER_REPO +ARG AITER_BRANCH +RUN git clone --recursive ${AITER_REPO} +RUN cd aiter \ + && git checkout ${AITER_BRANCH} \ + && git submodule update --init --recursive \ + && pip install -r requirements.txt \ + && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter + ARG BASE_IMAGE ARG HIPBLASLT_BRANCH +ARG HIPBLAS_COMMON_BRANCH ARG LEGACY_HIPBLASLT_OPTION ARG RCCL_BRANCH ARG RCCL_REPO @@ -155,4 +167,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \ && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ - && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt + && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \ + && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ + && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index 4a6a766b8ca0b..24147b741278b 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -7,7 +7,10 @@ from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.activation import (GeluAndMul, ReLUSquaredActivation, SiluAndMul) -from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.layernorm import ( + RMSNorm, dispatch_cuda_rmsnorm_func, fused_add_rms_norm, rms_norm, + 
rocm_aiter_fused_add_rms_norm, rocm_aiter_rms_norm) +from vllm.platforms import current_platform # Registered subclass for test @@ -87,3 +90,27 @@ def test_enabled_ops_invalid(env: str): custom_ops=env.split(","))) with set_current_vllm_config(vllm_config): RMSNorm(1024).enabled() + + +@pytest.mark.parametrize("add_residual", [True, False]) +@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"]) +@pytest.mark.parametrize("use_rocm_aiter_norm", ["0", "1"]) +@pytest.mark.skipif(not current_platform.is_rocm(), + reason="AITER is a feature exclusive for ROCm") +def test_rms_norm_dispatch(add_residual: bool, use_rocm_aiter: str, + use_rocm_aiter_norm: str, monkeypatch): + monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) + monkeypatch.setenv("VLLM_ROCM_USE_AITER_RMSNORM", use_rocm_aiter_norm) + rms_norm_func = dispatch_cuda_rmsnorm_func(add_residual) + + if not add_residual: + if current_platform.is_rocm() and int(use_rocm_aiter) and int( + use_rocm_aiter_norm): + assert rms_norm_func == rocm_aiter_rms_norm + else: + assert rms_norm_func == rms_norm + elif current_platform.is_rocm() and int(use_rocm_aiter) and int( + use_rocm_aiter_norm): + assert rms_norm_func == rocm_aiter_fused_add_rms_norm + else: + assert rms_norm_func == fused_add_rms_norm diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index a49926ea220e8..79fa3fa997738 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -3,7 +3,11 @@ Run `pytest tests/models/test_models.py`. """ + import pytest +import torch + +from vllm.platforms import current_platform from ...utils import check_logprobs_close @@ -13,7 +17,21 @@ from ...utils import check_logprobs_close # https://github.com/vllm-project/vllm/issues/14524 REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"] +# This list contains the model that are using AITER kernel. 
+# Skip model that are not using AITER tests. +# When more AITER kernels are added, this list will not be +# needed as all the models will be calling AITER kernels +# in parts of the operators +AITER_MODEL_LIST = [ + "meta-llama/Llama-3.2-1B-Instruct", + "openbmb/MiniCPM3-4B", + "Qwen/Qwen-7B", + "Qwen/Qwen2.5-0.5B-Instruct", + "ehristoforu/Falcon3-MoE-2x7B-Insruct", +] + +# @maybe_test_rocm_aiter @pytest.mark.parametrize( "model", [ @@ -69,19 +87,24 @@ REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"] @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, - monkeypatch, -) -> None: +@pytest.mark.parametrize( + "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]) +def test_models(hf_runner, vllm_runner, example_prompts, model: str, + dtype: str, max_tokens: int, num_logprobs: int, + use_rocm_aiter: bool, monkeypatch) -> None: + if model in REQUIRES_V0: monkeypatch.setenv("VLLM_USE_V1", "0") + if use_rocm_aiter and (model in AITER_MODEL_LIST): + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") + elif use_rocm_aiter and model not in AITER_MODEL_LIST: + # Skip model that are not using AITER tests. + # When more AITER kernels are added, this list will not be + # needed as all the models will be calling AITER kernels + # in parts of the operators + pytest.skip(f"Skipping '{model}' model test with AITER kernel.") + with hf_runner(model, dtype=dtype) as hf_model: if model.startswith("THUDM/chatglm3"): hf_model.model.get_output_embeddings = lambda: \ @@ -100,3 +123,10 @@ def test_models( name_0="hf", name_1="vllm", ) + if use_rocm_aiter: + # this is to ensure that vllm engine + # has deallocated the memory before running the next + # unit tests. 
On ROCm, when using AITER + # the memory might not be deallocated completely + # before running the next test case + torch.cuda.synchronize() diff --git a/vllm/envs.py b/vllm/envs.py index d54de9da25315..7c07940c26c26 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -75,6 +75,8 @@ if TYPE_CHECKING: VLLM_SKIP_P2P_CHECK: bool = False VLLM_DISABLED_KERNELS: list[str] = [] VLLM_USE_V1: bool = True + VLLM_ROCM_USE_AITER: bool = False + VLLM_ROCM_USE_AITER_RMSNORM: bool = True VLLM_ROCM_FP8_PADDING: bool = True VLLM_ENABLE_V1_MULTIPROCESSING: bool = True VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 @@ -528,6 +530,17 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_USE_V1": lambda: bool(int(os.getenv("VLLM_USE_V1", "1"))), + # Disable aiter ops unless specifically enabled. + # Acts as a parent switch to enable the rest of the other operations. + "VLLM_ROCM_USE_AITER": + lambda: (os.getenv("VLLM_ROCM_USE_AITER", "False").lower() in + ("true", "1")), + + # use aiter rms norm op if aiter ops are enabled. 
+ "VLLM_ROCM_USE_AITER_RMSNORM": + lambda: (os.getenv("VLLM_ROCM_USE_AITER_RMSNORM", "True").lower() in + ("true", "1")), + # Pad the fp8 weights to 256 bytes for ROCm "VLLM_ROCM_FP8_PADDING": lambda: bool(int(os.getenv("VLLM_ROCM_FP8_PADDING", "1"))), diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index b476fb0dbc7eb..76d3acb92fb81 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -5,7 +5,77 @@ from typing import Optional, Tuple, Union import torch import torch.nn as nn +import vllm.envs as envs from vllm.model_executor.custom_op import CustomOp +from vllm.platforms import current_platform + + +def is_rocm_aiter_rmsnorm_enabled() -> bool: + return current_platform.is_rocm() \ + and envs.VLLM_ROCM_USE_AITER_RMSNORM \ + and envs.VLLM_ROCM_USE_AITER + + +def rms_norm(x: torch.Tensor, weight: torch.Tensor, + variance_epsilon: float) -> torch.Tensor: + from vllm import _custom_ops as ops + out = torch.empty_like(x) + ops.rms_norm( + out, + x, + weight, + variance_epsilon, + ) + return out + + +def fused_add_rms_norm( + x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, + variance_epsilon: float) -> Tuple[torch.Tensor, torch.Tensor]: + from vllm import _custom_ops as ops + ops.fused_add_rms_norm( + x, + residual, + weight, + variance_epsilon, + ) + return x, residual + + +def rocm_aiter_rms_norm(x: torch.Tensor, weight: torch.Tensor, + variance_epsilon: float) -> torch.Tensor: + + import aiter as rocm_aiter + return rocm_aiter.rms_norm(x, weight, variance_epsilon) + + +def rocm_aiter_fused_add_rms_norm( + x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, + variance_epsilon: float) -> Tuple[torch.Tensor, torch.Tensor]: + + import aiter as rocm_aiter + + # Assuming the correct signature for rmsnorm2d_fwd_with_add + rocm_aiter.rmsnorm2d_fwd_with_add( + x, # output + x, # input + residual, # residual input + residual, # residual output + weight, + 
variance_epsilon, + ) + return x, residual + + +def dispatch_cuda_rmsnorm_func(add_residual: bool): + if add_residual: + if is_rocm_aiter_rmsnorm_enabled(): + return rocm_aiter_fused_add_rms_norm + return fused_add_rms_norm + + if is_rocm_aiter_rmsnorm_enabled(): + return rocm_aiter_rms_norm + return rms_norm @CustomOp.register("rms_norm") @@ -81,24 +151,14 @@ class RMSNorm(CustomOp): if self.variance_size_override is not None: return self.forward_native(x, residual) - from vllm import _custom_ops as ops + add_residual = residual is not None + norm_func = dispatch_cuda_rmsnorm_func(add_residual) - if residual is not None: - ops.fused_add_rms_norm( - x, - residual, - self.weight.data, - self.variance_epsilon, - ) - return x, residual - out = torch.empty_like(x) - ops.rms_norm( - out, - x, - self.weight.data, - self.variance_epsilon, - ) - return out + if add_residual: + return norm_func(x, residual, self.weight.data, + self.variance_epsilon) + else: + return norm_func(x, self.weight.data, self.variance_epsilon) def forward_hpu( self, From 1c2bec0f8215f57ae3fd40286d944b7201948430 Mon Sep 17 00:00:00 2001 From: wwl2755 Date: Sat, 22 Mar 2025 00:36:43 -0500 Subject: [PATCH 34/40] [Doc] add load_format items in docs (#14804) Signed-off-by: wwl2755 --- vllm/config.py | 6 ++++++ vllm/engine/arg_utils.py | 10 ++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 1f7147f7cfd41..181fa803c620b 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1294,6 +1294,12 @@ class LoadConfig: "tensorizer" will use CoreWeave's tensorizer library for fast weight loading. "bitsandbytes" will load nf4 type weights. + "sharded_state" will load weights from pre-sharded checkpoint files, + supporting efficient loading of tensor-parallel models. + "gguf" will load weights from GGUF format files. + "mistral" will load weights from consolidated safetensors files used + by Mistral models. 
+ "runai_streamer" will load weights from RunAI streamer format files. model_loader_extra_config: The extra config for the model loader. ignore_patterns: The list of patterns to ignore when loading the model. Default to "original/**/*" to avoid repeated loading of llama's diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index edfa748b82d7b..e396e68f823d9 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -339,9 +339,15 @@ class EngineArgs: 'CoreWeave. See the Tensorize vLLM Model script in the Examples ' 'section for more information.\n' '* "runai_streamer" will load the Safetensors weights using Run:ai' - 'Model Streamer \n' + 'Model Streamer.\n' '* "bitsandbytes" will load the weights using bitsandbytes ' - 'quantization.\n') + 'quantization.\n' + '* "sharded_state" will load weights from pre-sharded checkpoint ' + 'files, supporting efficient loading of tensor-parallel models\n' + '* "gguf" will load weights from GGUF format files (details ' + 'specified in https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n' + '* "mistral" will load weights from consolidated safetensors files ' + 'used by Mistral models.\n') parser.add_argument( '--config-format', default=EngineArgs.config_format, From 2fa0e1396bf02c819f0b89762b8ca395eff4c757 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 22 Mar 2025 13:49:34 +0800 Subject: [PATCH 35/40] [Bugfix] Fix torch.compile raise FileNotFoundError (#15278) Signed-off-by: Jee Jee Li --- vllm/compilation/backends.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index cdae42fe4fce8..089d415ab5fb8 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -399,6 +399,7 @@ class VllmBackend: rank = vllm_config.parallel_config.rank dp_rank = vllm_config.parallel_config.data_parallel_rank local_cache_dir = os.path.join(cache_dir, f"rank_{rank}_{dp_rank}") + os.makedirs(local_cache_dir, exist_ok=True) 
self.compilation_config.local_cache_dir = local_cache_dir disable_cache = envs.VLLM_DISABLE_COMPILE_CACHE From 8a8b30eac1ab746c48b5a1e3eaa4e91941177bf8 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Sat, 22 Mar 2025 05:03:32 -0400 Subject: [PATCH 36/40] [Bugfix] LoRA V0 - Fix case where `max_num_seqs` is between cudagraph capture sizes (#15308) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- tests/lora/test_llama_tp.py | 14 ++++++++------ vllm/lora/punica_wrapper/punica_gpu.py | 10 +++++++++- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 0acdaeac6952d..7026f705026fb 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -84,12 +84,14 @@ def v1(run_with_both_engines_lora): @create_new_process_for_each_test() def test_llama_lora(sql_lora_files): - llm = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - tensor_parallel_size=1, - enable_chunked_prefill=True) + llm = vllm.LLM( + MODEL_PATH, + enable_lora=True, + # also test odd max_num_seqs + max_num_seqs=13, + max_loras=4, + tensor_parallel_size=1, + enable_chunked_prefill=True) generate_and_test(llm, sql_lora_files) diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py index eb6f5b1b488c4..be9cbe244a819 100644 --- a/vllm/lora/punica_wrapper/punica_gpu.py +++ b/vllm/lora/punica_wrapper/punica_gpu.py @@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, List, Optional, Tuple, Union, final import torch +import vllm.envs as envs from vllm.lora.layers import LoRAMapping from vllm.triton_utils import HAS_TRITON @@ -42,8 +43,15 @@ class PunicaWrapperGPU(PunicaWrapperBase): self.token_mapping_meta = LoRAKernelMeta.make(self.max_loras, max_num_batched_tokens, device=device) + + # When cudagraph capture size is greater than max_num_seqs (max_batches, + # here), V0 captures the graph as if max_num_seqs is set 
to + # the capture size. + # V1 doesn't have this problem and always respects max_num_seqs. + max_num_prompts = (max_batches + if envs.VLLM_USE_V1 else max_num_batched_tokens) self.prompt_mapping_meta = LoRAKernelMeta.make(self.max_loras, - max_batches, + max_num_prompts, device=device) def update_metadata( From 2f4bd358f1d39846a38e8f61d11edfa446617dcd Mon Sep 17 00:00:00 2001 From: Naitong Yu Date: Sat, 22 Mar 2025 17:04:44 +0800 Subject: [PATCH 37/40] [Model] Support Tele-FLM Model (#15023) Signed-off-by: Naitong Yu Signed-off-by: jiangxin Co-authored-by: Jason Fang Co-authored-by: jiangxin --- docs/source/models/supported_models.md | 5 ++ examples/template_teleflm.jinja | 12 ++++ tests/models/registry.py | 2 + vllm/model_executor/models/registry.py | 1 + vllm/model_executor/models/teleflm.py | 79 ++++++++++++++++++++++++++ 5 files changed, 99 insertions(+) create mode 100644 examples/template_teleflm.jinja create mode 100644 vllm/model_executor/models/teleflm.py diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 5e5e7287f39eb..ba01f2309b3ca 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -472,6 +472,11 @@ See [this page](#generative-models) for more information on how to use generativ * `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. * ✅︎ * ✅︎ +- * `TeleFLMForCausalLM` + * TeleFLM + * `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. + * ✅︎ + * ✅︎ - * `XverseForCausalLM` * XVERSE * `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. 
diff --git a/examples/template_teleflm.jinja b/examples/template_teleflm.jinja new file mode 100644 index 0000000000000..0cb29ccbb8410 --- /dev/null +++ b/examples/template_teleflm.jinja @@ -0,0 +1,12 @@ +{%- for message in messages %} + {%- if message['role'] == 'user' %} + {{- '<_user>' + message['content']|trim }} + {%- elif message['role'] == 'system' %} + {{- '<_system>' + message['content']|trim }} + {%- elif message['role'] == 'assistant' %} + {{- '<_bot>' + message['content'] }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<_bot>' }} +{%- endif %} diff --git a/tests/models/registry.py b/tests/models/registry.py index 554e28863a7b5..5c84e85aaa907 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -192,6 +192,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"), "TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B", trust_remote_code=True), + "TeleFLMForCausalLM": _HfExamplesInfo("CofeAI/FLM-2-52B-Instruct-2407", + trust_remote_code=True), "XverseForCausalLM": _HfExamplesInfo("xverse/XVERSE-7B-Chat", is_available_online=False, trust_remote_code=True), diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 74ae06c55d88e..7c8e506713833 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -104,6 +104,7 @@ _TEXT_GENERATION_MODELS = { "Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"), "SolarForCausalLM": ("solar", "SolarForCausalLM"), "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), + "TeleFLMForCausalLM": ("teleflm", "TeleFLMForCausalLM"), "XverseForCausalLM": ("llama", "LlamaForCausalLM"), "Zamba2ForCausalLM": ("zamba2", "Zamba2ForCausalLM"), # [Encoder-decoder] diff --git a/vllm/model_executor/models/teleflm.py b/vllm/model_executor/models/teleflm.py new file mode 100644 index 0000000000000..e670b1df08f70 --- /dev/null +++ 
b/vllm/model_executor/models/teleflm.py @@ -0,0 +1,79 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Type + +import torch + +from vllm.config import VllmConfig +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.models.llama import (LlamaDecoderLayer, + LlamaForCausalLM, LlamaModel) + + +class TeleFLMModel(LlamaModel): + + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + layer_type: Type[LlamaDecoderLayer] = LlamaDecoderLayer, + ): + super().__init__(vllm_config=vllm_config, + prefix=prefix, + layer_type=layer_type) + """ + This implementation is based on the µScaling paper presented at + the ICLR 2025 Workshop: + NanoLM: An Affordable LLM Study Benchmark \ + via Accurate Loss Prediction across Scales + by Yiqun Yao et al. 
+ Available at: https://openreview.net/forum?id=IwaPYg1SCA + arXiv preprint: https://arxiv.org/abs/2304.06875 + """ + self.use_mup = self.config.use_mup + if self.use_mup: + self.input_mult = self.config.input_mult + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + embedding = self.embed_tokens(input_ids) + if self.use_mup: + embedding = embedding * self.input_mult + return embedding + + +class TeleFLMForCausalLM(LlamaForCausalLM): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + # mup + self.use_mup = self.config.use_mup + if self.use_mup: + self.mup_scale_factor = self.config.mup_scale_factor + self.output_mult = self.config.output_mult / self.mup_scale_factor + logit_scale = self.output_mult + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + self.config.vocab_size, + logit_scale) From eb63ea1e185846b4d02333b61f73a02fe60a242e Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Sat, 22 Mar 2025 11:56:17 -0400 Subject: [PATCH 38/40] [V1] Add `disable-any-whitespace` option support for xgrammar (#15316) Signed-off-by: Russell Bryant --- .../llm/test_struct_output_generate.py | 45 ++++++++++++++++++- vllm/engine/arg_utils.py | 4 +- vllm/v1/engine/processor.py | 2 +- vllm/v1/structured_output/backend_xgrammar.py | 7 ++- 4 files changed, 53 insertions(+), 5 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index b4eb475c23baa..d99ae59ddd4a8 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -57,6 +57,50 @@ def test_guided_json_completion( jsonschema.validate(instance=output_json, schema=sample_json_schema) +@pytest.mark.skip_global_cleanup +@pytest.mark.parametrize("guided_decoding_backend", + GUIDED_DECODING_BACKENDS_V1) +@pytest.mark.parametrize("model_name", 
MODELS_TO_TEST) +def test_guided_json_completion_disable_any_whitespace( + monkeypatch: pytest.MonkeyPatch, + sample_json_schema: dict[str, Any], + guided_decoding_backend: str, + model_name: str, +): + if guided_decoding_backend != "xgrammar": + pytest.skip("disable-any-whitespace is only supported for xgrammar.") + guided_decoding_backend = 'xgrammar:disable-any-whitespace' + + monkeypatch.setenv("VLLM_USE_V1", "1") + llm = LLM(model=model_name, + max_model_len=1024, + guided_decoding_backend=guided_decoding_backend) + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=1000, + guided_decoding=GuidedDecodingParams(json=sample_json_schema)) + outputs = llm.generate(prompts=[ + f"Give an example JSON for an employee profile " + f"that fits this schema: {sample_json_schema}" + ] * 2, + sampling_params=sampling_params, + use_tqdm=True) + + assert outputs is not None + + for output in outputs: + assert output is not None + assert isinstance(output, RequestOutput) + prompt = output.prompt + + generated_text = output.outputs[0].text + assert generated_text is not None + assert "\n" not in generated_text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + output_json = json.loads(generated_text) + jsonschema.validate(instance=output_json, schema=sample_json_schema) + + @pytest.mark.skip_global_cleanup @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS_V1) @@ -301,7 +345,6 @@ def test_guided_choice_completion( prompts="The best language for type-safe systems programming is ", sampling_params=sampling_params, use_tqdm=True) - assert outputs is not None for output in outputs: assert output is not None diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e396e68f823d9..35c60a602665a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1486,7 +1486,9 @@ class EngineArgs: return False # Only support Xgrammar for guided decoding so far. 
- SUPPORTED_GUIDED_DECODING = ["xgrammar", "xgrammar:nofallback"] + SUPPORTED_GUIDED_DECODING = [ + "xgrammar", "xgrammar:disable-any-whitespace" + ] if self.guided_decoding_backend not in SUPPORTED_GUIDED_DECODING: _raise_or_fallback(feature_name="--guided-decoding-backend", recommend_to_remove=False) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 55e0fdcd65b5e..8ba06336be024 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -120,7 +120,7 @@ class Processor: if not params.guided_decoding or not self.decoding_config: return - supported_backends = ["xgrammar"] + supported_backends = ["xgrammar", "xgrammar:disable-any-whitespace"] engine_level_backend = self.decoding_config.guided_decoding_backend if engine_level_backend not in supported_backends: raise ValueError(f"Only {supported_backends} structured output is " diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index ce93ca5c751be..9bfb644c58094 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -26,6 +26,9 @@ class XgrammarBackend(StructuredOutputBackend): def __init__(self, vllm_config: VllmConfig): self.vllm_config = vllm_config + self.disable_any_whitespace = ( + "disable-any-whitespace" + in vllm_config.decoding_config.guided_decoding_backend) tokenizer_group = init_tokenizer_from_configs( model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, @@ -74,8 +77,8 @@ class XgrammarBackend(StructuredOutputBackend): def compile_grammar(self, request_type: StructuredOutputOptions, grammar_spec: str) -> StructuredOutputGrammar: if request_type == StructuredOutputOptions.JSON: - ctx = self.compiler.compile_json_schema(grammar_spec, - any_whitespace=False) + ctx = self.compiler.compile_json_schema( + grammar_spec, any_whitespace=not self.disable_any_whitespace) elif request_type == 
StructuredOutputOptions.JSON_OBJECT: ctx = self.compiler.compile_builtin_json_grammar() elif request_type == StructuredOutputOptions.GRAMMAR: From dd861b992f33f3f4a06ee91cf3346b038929d000 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wang=20Ran=20=28=E6=B1=AA=E7=84=B6=29?= Date: Sun, 23 Mar 2025 00:05:03 +0800 Subject: [PATCH 39/40] [BugFix][Typing] Fix Imprecise Type Annotations (#15208) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Wang Ran (汪然) --- vllm/v1/engine/core_client.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 5ed4645797846..13b72c80dc0d4 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -212,9 +212,9 @@ class BackgroundResources: """Used as a finalizer for clean shutdown, avoiding circular reference back to the client object.""" - ctx: Union[zmq.Context] = None - output_socket: Union[zmq.Socket, zmq.asyncio.Socket] = None - input_socket: Union[zmq.Socket, zmq.asyncio.Socket] = None + ctx: zmq.Context + output_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None + input_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None proc_handle: Optional[BackgroundProcHandle] = None shutdown_path: Optional[str] = None From b877031d806e3d9ebc834e0191ae64de40c4ddc2 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Sat, 22 Mar 2025 17:06:39 -0400 Subject: [PATCH 40/40] Remove openvino support in favor of external plugin (#15339) Signed-off-by: Russell Bryant --- .buildkite/run-openvino-test.sh | 16 - Dockerfile.openvino | 29 - docs/source/getting_started/installation.md | 1 - .../installation/ai_accelerator.md | 77 --- .../ai_accelerator/openvino.inc.md | 110 ---- requirements/openvino.txt | 8 - setup.py | 10 +- tests/conftest.py | 3 +- tests/kernels/test_attention_selector.py | 14 +- tests/utils.py | 2 +- vllm/attention/backends/openvino.py | 146 ----- 
vllm/config.py | 2 +- vllm/engine/arg_utils.py | 1 - vllm/envs.py | 28 +- vllm/model_executor/model_loader/openvino.py | 204 ------ vllm/platforms/__init__.py | 17 - vllm/platforms/interface.py | 5 - vllm/platforms/openvino.py | 152 ----- vllm/worker/openvino_model_runner.py | 372 ----------- vllm/worker/openvino_worker.py | 600 ------------------ 20 files changed, 8 insertions(+), 1789 deletions(-) delete mode 100755 .buildkite/run-openvino-test.sh delete mode 100644 Dockerfile.openvino delete mode 100644 docs/source/getting_started/installation/ai_accelerator/openvino.inc.md delete mode 100644 requirements/openvino.txt delete mode 100644 vllm/attention/backends/openvino.py delete mode 100644 vllm/model_executor/model_loader/openvino.py delete mode 100644 vllm/platforms/openvino.py delete mode 100644 vllm/worker/openvino_model_runner.py delete mode 100644 vllm/worker/openvino_worker.py diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh deleted file mode 100755 index a1103bed66ecb..0000000000000 --- a/.buildkite/run-openvino-test.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -# This script build the OpenVINO docker image and run the offline inference inside the container. -# It serves a sanity check for compilation and basic model usage. -set -ex - -# Try building the docker image -docker build -t openvino-test -f Dockerfile.openvino . 
- -# Setup cleanup -remove_docker_container() { docker rm -f openvino-test || true; } -trap remove_docker_container EXIT -remove_docker_container - -# Run the image and launch offline inference -docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic/generate.py --model facebook/opt-125m diff --git a/Dockerfile.openvino b/Dockerfile.openvino deleted file mode 100644 index 445c70ab89d48..0000000000000 --- a/Dockerfile.openvino +++ /dev/null @@ -1,29 +0,0 @@ -# The vLLM Dockerfile is used to construct vLLM image that can be directly used -# to run the OpenAI compatible server. - -FROM ubuntu:22.04 AS dev - -RUN apt-get update -y && \ - apt-get install -y \ - git python3-pip \ - ffmpeg libsm6 libxext6 libgl1 -WORKDIR /workspace - -COPY . . -ARG GIT_REPO_CHECK=0 -RUN --mount=type=bind,source=.git,target=.git \ - if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi - -RUN python3 -m pip install -U pip -# install build requirements -RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements/build.txt -# build vLLM with OpenVINO backend -RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace - -COPY examples/ /workspace/examples -COPY benchmarks/ /workspace/benchmarks - -# install development dependencies (for testing) -RUN python3 -m pip install -e tests/vllm_test_utils - -CMD ["/bin/bash"] diff --git a/docs/source/getting_started/installation.md b/docs/source/getting_started/installation.md index af9fd495a7299..44134bf01b76c 100644 --- a/docs/source/getting_started/installation.md +++ b/docs/source/getting_started/installation.md @@ -26,4 +26,3 @@ installation/ai_accelerator - Google TPU - Intel Gaudi - AWS Neuron - - OpenVINO diff --git a/docs/source/getting_started/installation/ai_accelerator.md 
b/docs/source/getting_started/installation/ai_accelerator.md index 61a853ccefd2d..0a207af1a4c75 100644 --- a/docs/source/getting_started/installation/ai_accelerator.md +++ b/docs/source/getting_started/installation/ai_accelerator.md @@ -36,16 +36,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you :::: -::::{tab-item} OpenVINO -:sync: openvino - -:::{include} ai_accelerator/openvino.inc.md -:start-after: "# Installation" -:end-before: "## Requirements" -::: - -:::: - ::::: ## Requirements @@ -83,16 +73,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you :::: -::::{tab-item} OpenVINO -:sync: openvino - -:::{include} ai_accelerator/openvino.inc.md -:start-after: "## Requirements" -:end-before: "## Set up using Python" -::: - -:::: - ::::: ## Configure a new environment @@ -130,14 +110,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you :::: -::::{tab-item} OpenVINO -:sync: openvino - -:::{include} python_env_setup.inc.md -::: - -:::: - ::::: ## Set up using Python @@ -177,16 +149,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you :::: -::::{tab-item} OpenVINO -:sync: openvino - -:::{include} ai_accelerator/openvino.inc.md -:start-after: "### Pre-built wheels" -:end-before: "### Build wheel from source" -::: - -:::: - ::::: ### Build wheel from source @@ -224,16 +186,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you :::: -::::{tab-item} OpenVINO -:sync: openvino - -:::{include} ai_accelerator/openvino.inc.md -:start-after: "### Build wheel from source" -:end-before: "## Set up using Docker" -::: - -:::: - ::::: ## Set up using Docker @@ -273,16 +225,6 @@ vLLM is a Python library that supports the following AI accelerators. 
Select you :::: -::::{tab-item} OpenVINO -:sync: openvino - -:::{include} ai_accelerator/openvino.inc.md -:start-after: "### Pre-built images" -:end-before: "### Build image from source" -::: - -:::: - ::::: ### Build image from source @@ -320,16 +262,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you :::: -::::{tab-item} OpenVINO -:sync: openvino - -:::{include} ai_accelerator/openvino.inc.md -:start-after: "### Build image from source" -:end-before: "## Extra information" -::: - -:::: - ::::: ## Extra information @@ -364,13 +296,4 @@ vLLM is a Python library that supports the following AI accelerators. Select you :::: -::::{tab-item} OpenVINO -:sync: openvino - -:::{include} ai_accelerator/openvino.inc.md -:start-after: "## Extra information" -::: - -:::: - ::::: diff --git a/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md deleted file mode 100644 index ab0db4795da77..0000000000000 --- a/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md +++ /dev/null @@ -1,110 +0,0 @@ -# Installation - -vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). - -:::{attention} -There are no pre-built wheels or images for this device, so you must build vLLM from source. -::: - -## Requirements - -- OS: Linux -- Instruction set architecture (ISA) requirement: at least AVX2. - -## Set up using Python - -### Pre-built wheels - -Currently, there are no pre-built OpenVINO wheels. - -### Build wheel from source - -First, install Python and ensure you have the latest pip. 
For example, on Ubuntu 22.04, you can run: - -```console -sudo apt-get update -y -sudo apt-get install python3 -pip install --upgrade pip -``` - -Second, clone vLLM and install prerequisites for the vLLM OpenVINO backend installation: - -```console -git clone https://github.com/vllm-project/vllm.git -cd vllm -pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu -``` - -Finally, install vLLM with OpenVINO backend: - -```console -PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v . -``` - -:::{tip} -To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html). -::: - -## Set up using Docker - -### Pre-built images - -Currently, there are no pre-built OpenVINO images. - -### Build image from source - -```console -docker build -f Dockerfile.openvino -t vllm-openvino-env . -docker run -it --rm vllm-openvino-env -``` - -## Extra information - -## Supported features - -OpenVINO vLLM backend supports the following advanced vLLM features: - -- Prefix caching (`--enable-prefix-caching`) -- Chunked prefill (`--enable-chunked-prefill`) - -## Performance tips - -### vLLM OpenVINO backend environment variables - -- `VLLM_OPENVINO_DEVICE` to specify which device utilize for the inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g, `VLLM_OPENVINO_DEVICE=GPU.1`). If the value is not specified, CPU device is used by default. -- `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON` to enable U8 weights compression during model loading stage. By default, compression is turned off. 
You can also export model with different compression techniques using `optimum-cli` and pass exported folder as `` - -### CPU performance tips - -CPU uses the following environment variables to control behavior: - -- `VLLM_OPENVINO_KVCACHE_SPACE` to specify the KV Cache size (e.g, `VLLM_OPENVINO_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. -- `VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8` to control KV cache precision. By default, FP16 / BF16 is used depending on platform. - -To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (`--enable-chunked-prefill`). Based on the experiments, the recommended batch size is `256` (`--max-num-batched-tokens`) - -OpenVINO best known configuration for CPU is: - -```console -$ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ - python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256 -``` - -### GPU performance tips - -GPU device implements the logic for automatic detection of available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking into account `gpu_memory_utilization` option). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using `VLLM_OPENVINO_KVCACHE_SPACE` environment variable (e.g, `VLLM_OPENVINO_KVCACHE_SPACE=8` means 8 GB space for KV cache). - -Currently, the best performance using GPU can be achieved with the default vLLM execution parameters for models with quantized weights (8 and 4-bit integer data types are supported) and `preemption-mode=swap`. 
- -OpenVINO best known configuration for GPU is: - -```console -$ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ - python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json -``` - -## Limitations - -- LoRA serving is not supported. -- Only LLM models are currently supported. LLaVa and encoder-decoder models are not currently enabled in vLLM OpenVINO integration. -- Tensor and pipeline parallelism are not currently enabled in vLLM integration. diff --git a/requirements/openvino.txt b/requirements/openvino.txt deleted file mode 100644 index 04b8c3b009a23..0000000000000 --- a/requirements/openvino.txt +++ /dev/null @@ -1,8 +0,0 @@ -# Common dependencies --r common.txt - -torch == 2.5.1 # should be aligned with "common" vLLM torch version -openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention - -optimum @ git+https://github.com/huggingface/optimum.git # latest optimum is used to support latest transformers version -optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git # latest optimum-intel is used to support latest transformers version diff --git a/setup.py b/setup.py index d412f34b3e3dc..6c45413c321ba 100755 --- a/setup.py +++ b/setup.py @@ -449,10 +449,6 @@ def _is_cpu() -> bool: return VLLM_TARGET_DEVICE == "cpu" -def _is_openvino() -> bool: - return VLLM_TARGET_DEVICE == "openvino" - - def _is_xpu() -> bool: return VLLM_TARGET_DEVICE == "xpu" @@ -572,8 +568,6 @@ def get_vllm_version() -> str: if gaudi_sw_version != MAIN_CUDA_VERSION: gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3] version += f"{sep}gaudi{gaudi_sw_version}" - elif _is_openvino(): - version += f"{sep}openvino" elif _is_tpu(): version += f"{sep}tpu" elif _is_cpu(): @@ -623,8 +617,6 @@ def get_requirements() -> list[str]: requirements = _read_requirements("neuron.txt") elif _is_hpu(): requirements = _read_requirements("hpu.txt") - 
elif _is_openvino(): - requirements = _read_requirements("openvino.txt") elif _is_tpu(): requirements = _read_requirements("tpu.txt") elif _is_cpu(): @@ -634,7 +626,7 @@ def get_requirements() -> list[str]: else: raise ValueError( "Unsupported platform, please use CUDA, ROCm, Neuron, HPU, " - "OpenVINO, or CPU.") + "or CPU.") return requirements diff --git a/tests/conftest.py b/tests/conftest.py index 0c71d98101647..cc48fceb8eff0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -273,8 +273,7 @@ class HfRunner: def get_default_device(self): from vllm.platforms import current_platform - return ("cpu" if current_platform.is_cpu() - or current_platform.is_openvino() else "cuda") + return ("cpu" if current_platform.is_cpu() else "cuda") def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: if x is None or isinstance(x, (bool, )): diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index 1615c23a4f714..a51e70d45ee0c 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from unittest.mock import Mock, patch +from unittest.mock import patch import pytest import torch @@ -8,7 +8,6 @@ import torch from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cuda import CudaPlatform -from vllm.platforms.openvino import OpenVinoPlatform from vllm.platforms.rocm import RocmPlatform from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL @@ -21,9 +20,9 @@ def clear_cache(): @pytest.mark.parametrize( - "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"]) + "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"]) @pytest.mark.parametrize("use_v1", [True, False]) -@pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"]) +@pytest.mark.parametrize("device", ["cpu", "hip", 
"cuda"]) def test_env( name: str, use_v1: bool, @@ -51,13 +50,6 @@ def test_env( 16, False) EXPECTED = "TRITON_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH" assert backend.get_name() == EXPECTED - elif device == "openvino": - with patch("vllm.attention.selector.current_platform", - OpenVinoPlatform()), patch.dict('sys.modules', - {'openvino': Mock()}): - backend = get_attn_backend(16, torch.float16, torch.float16, - 16, False) - assert backend.get_name() == "OPENVINO" else: if name in ["XFORMERS", "FLASHINFER"]: with patch("vllm.attention.selector.current_platform", diff --git a/tests/utils.py b/tests/utils.py index 627cf567afcca..a827b6d4b9bfe 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -786,7 +786,7 @@ def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator: without enough resources, or called when filtering tests to run directly. """ try: - if current_platform.is_cpu() or current_platform.is_openvino(): + if current_platform.is_cpu(): memory_gb = 0 else: memory_gb = current_platform.get_device_total_memory() / GB_bytes diff --git a/vllm/attention/backends/openvino.py b/vllm/attention/backends/openvino.py deleted file mode 100644 index 9908620a32a23..0000000000000 --- a/vllm/attention/backends/openvino.py +++ /dev/null @@ -1,146 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple, Type - -import openvino as ov -import torch - -from vllm.attention.backends.abstract import (AttentionBackend, - AttentionMetadata) -from vllm.attention.backends.utils import CommonAttentionState -from vllm.multimodal import MultiModalPlaceholderMap - - -def copy_cache_block(src_tensor: ov.Tensor, dst_tensor: ov.Tensor, - src_offset: int, dst_offset: int) -> None: - - def create_roi_tensor( - tensor: ov.Tensor, - block_number: int, - ) -> ov.Tensor: - roi_begin = ov.runtime.Coordinate([0, 0, 0, 0]) - roi_end = ov.runtime.Coordinate(tensor.get_shape()) - - roi_begin[0] = block_number - roi_end[0] = 
block_number + 1 - - if isinstance(tensor, ov.Tensor): - return ov.Tensor(tensor, roi_begin, roi_end) - else: - return ov.RemoteTensor(tensor, roi_begin, roi_end) - - src_roi_tensor = \ - create_roi_tensor(src_tensor, src_offset) - dst_roi_tensor = \ - create_roi_tensor(dst_tensor, dst_offset) - src_roi_tensor.copy_to(dst_roi_tensor) - - -class OpenVINOAttentionBackend(AttentionBackend): - - @staticmethod - def get_name() -> str: - return "OPENVINO" - - @staticmethod - def get_impl_cls(): - # OpenVINO implements PagedAttention as part of the Optimum - # exported model - raise NotImplementedError - - @staticmethod - def make_metadata(*args, **kwargs) -> "AttentionMetadata": - raise NotImplementedError - - @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def make_openvino_metadata(*args, **kwargs) -> "OpenVINOAttentionMetadata": - return OpenVINOAttentionMetadata(*args, **kwargs) - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - return (2, num_blocks, num_kv_heads, block_size, head_size) - - @staticmethod - def swap_blocks( - src_tensor: ov.Tensor, - dst_tensor: ov.Tensor, - src_to_dists: List[Tuple[int, int]], - ) -> None: - for src, dst in src_to_dists: - copy_cache_block(src_tensor, dst_tensor, src, dst) - - @staticmethod - def copy_blocks( - kv_caches: List[Tuple[ov.Tensor, ov.Tensor]], - src_to_dists: List[Tuple[int, int]], - ) -> None: - for src, dst in src_to_dists: - for key_cache, value_cache in kv_caches: - copy_cache_block(key_cache, key_cache, src, dst) - copy_cache_block(value_cache, value_cache, src, dst) - - -@dataclass -class OpenVINOAttentionMetadata: - """Metadata for OpenVINOAttentionBackend. 
- - Basic terms used below: - - batch_size_in_sequences - total number of sequences to execute​ - - prompt_lens – per sequence size number of scheduled tokens​ - - batch_size_in_tokens = sum(prompt_lens)​ - - max_context_len = max(context_lens)​ - - max_num_blocks = div_up(max_context_len / BLOCK_SIZE)​ - - num_blocks – total number of blocks in block_indices​ - """ - - # Describes past KV cache size for each sequence within a batch - # Shape: [batch_size_in_sequences] - # Type: i32​ - past_lens: torch.Tensor - - # Describes start indices of input / speculative tokens from - # current sequences within a batch sequence​ - # Shape: [batch_size_in_sequences + 1]​ - # Type: i32 - subsequence_begins: torch.Tensor - - # Describes block tables for each sequence within a batch​ - - # indices along 0th dimension in key_cache and value_cache inputs​ - # Shape: [num_blocks] - # Type: i32​ - block_indices: torch.Tensor - - # Describes block tables for each sequence within a batch​ - - # for i-th element, it is an index in block_indices with the - # first block belonging to i-th sequence​ - # Shape: [batch_size_in_sequences + 1] - # Type: i32​ - block_indices_begins: torch.Tensor - - # Describes max context length - # Shape: scalar - # Type: i32 - max_context_len: torch.Tensor - - # The index maps that relate multi-modal embeddings to the corresponding - # placeholders. - # - # N.B. These aren't really related to attention and don't belong on this - # type -- this is just a temporary solution to make them available to - # `model_executable`. - multi_modal_placeholder_index_maps: Optional[Dict[ - str, MultiModalPlaceholderMap.IndexMap]] - - # Enable/disable KV scales calculation. This is so that we can disable the - # calculation until after prefill and cuda graph capture. 
- enable_kv_scales_calculation: bool diff --git a/vllm/config.py b/vllm/config.py index 181fa803c620b..42f517e49a11b 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1801,7 +1801,7 @@ class DeviceConfig: self.device_type = device # Some device types require processing inputs on CPU - if self.device_type in ["neuron", "openvino"]: + if self.device_type in ["neuron"]: self.device = torch.device("cpu") elif self.device_type in ["tpu"]: self.device = None diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 35c60a602665a..5d06a86e6950d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -40,7 +40,6 @@ DEVICE_OPTIONS = [ "cuda", "neuron", "cpu", - "openvino", "tpu", "xpu", "hpu", diff --git a/vllm/envs.py b/vllm/envs.py index 7c07940c26c26..829f9450fb773 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -40,10 +40,6 @@ if TYPE_CHECKING: VLLM_CPU_KVCACHE_SPACE: int = 0 VLLM_CPU_OMP_THREADS_BIND: str = "" VLLM_CPU_MOE_PREPACK: bool = True - VLLM_OPENVINO_DEVICE: str = "CPU" - VLLM_OPENVINO_KVCACHE_SPACE: int = 0 - VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None - VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache") VLLM_XLA_CHECK_RECOMPILATION: bool = False VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024 @@ -131,7 +127,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # ================== Installation Time Env Vars ================== # Target device of vLLM, supporting [cuda (by default), - # rocm, neuron, cpu, openvino] + # rocm, neuron, cpu] "VLLM_TARGET_DEVICE": lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"), @@ -358,28 +354,6 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_CPU_MOE_PREPACK": lambda: bool(int(os.getenv("VLLM_CPU_MOE_PREPACK", "1"))), - # OpenVINO device selection - # default is CPU - "VLLM_OPENVINO_DEVICE": - lambda: os.getenv("VLLM_OPENVINO_DEVICE", "CPU").upper(), - - # OpenVINO key-value cache space - # 
default is 4GB - "VLLM_OPENVINO_KVCACHE_SPACE": - lambda: int(os.getenv("VLLM_OPENVINO_KVCACHE_SPACE", "0")), - - # OpenVINO KV cache precision - # default is bf16 if natively supported by platform, otherwise f16 - # To enable KV cache compression, please, explicitly specify u8 - "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION": - lambda: os.getenv("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION", None), - - # Enables weights compression during model export via HF Optimum - # default is False - "VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS": - lambda: - (os.environ.get("VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", "0").lower() in - ("on", "true", "1")), # If the env var is set, then all workers will execute as separate # processes from the engine, and we use the same mechanism to trigger # execution on all workers. diff --git a/vllm/model_executor/model_loader/openvino.py b/vllm/model_executor/model_loader/openvino.py deleted file mode 100644 index cd2d427edbbd1..0000000000000 --- a/vllm/model_executor/model_loader/openvino.py +++ /dev/null @@ -1,204 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -# ruff: noqa: SIM117 -from pathlib import Path -from typing import Optional - -import openvino as ov -import torch -from huggingface_hub import HfApi -from openvino._offline_transformations import paged_attention_transformation -from optimum.intel import OVModelForCausalLM -from torch import nn - -import vllm.envs as envs -from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config -from vllm.forward_context import get_forward_context -from vllm.logger import init_logger -from vllm.model_executor.layers.logits_processor import (LogitsProcessor, - _prune_hidden_states) -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.platforms import current_platform - -logger = init_logger(__name__) - - -def _flatten_inputs(inputs): - """ - Helper function for making nested inputs flattens - """ - 
flatten_inputs = [] - for input_data in inputs: - if input_data is None: - continue - if isinstance(input_data, (list, tuple)): - flatten_inputs.extend(_flatten_inputs(input_data)) - elif isinstance(input_data, dict): - flatten_inputs.extend(_flatten_inputs(list(input_data.values()))) - else: - flatten_inputs.append(input_data) - return flatten_inputs - - -def _modify_cache_parameters(model: ov.Model, kv_cache_dtype: ov.Type, - is_cpu: bool): - # Apply hardware dependent modifications to KV tensors - for parameter in model.get_parameters(): - input = parameter.get_output_tensor(0) - input_names = input.get_names() - if len(input_names) != 1: - continue - input_name = next(iter(input_names)) - shape = parameter.get_partial_shape() - # use real block size if available, just a placeholder - # to provide the expected rank - num_blocks = ov.Dimension() - block_size = ov.Dimension() - head_size = ov.Dimension() - if input_name.startswith("key_cache."): - cpu_shape = [num_blocks, shape[1], block_size, head_size] - gpu_shape = [num_blocks, shape[1], shape[2], block_size] - elif input_name.startswith("value_cache."): - cpu_shape = [num_blocks, shape[1], block_size, head_size] - gpu_shape = [num_blocks, shape[1], block_size, shape[2]] - else: - continue - parameter.set_partial_shape( - ov.PartialShape(cpu_shape if is_cpu else gpu_shape)) - parameter.set_element_type(kv_cache_dtype) - model.validate_nodes_and_infer_types() - - -def _require_model_export(model_id, revision=None, subfolder=None): - model_dir = Path(model_id) - if subfolder is not None: - model_dir = model_dir / subfolder - if model_dir.is_dir(): - return (not (model_dir / "openvino_model.xml").exists() - or not (model_dir / "openvino_model.bin").exists()) - - hf_api = HfApi() - try: - model_info = hf_api.model_info(model_id, revision=revision or "main") - normalized_subfolder = (None if subfolder is None else - Path(subfolder).as_posix()) - model_files = [ - file.rfilename for file in model_info.siblings - if 
normalized_subfolder is None - or file.rfilename.startswith(normalized_subfolder) - ] - ov_model_path = ("openvino_model.xml" if normalized_subfolder is None - else f"{normalized_subfolder}/openvino_model.xml") - return (ov_model_path not in model_files - or ov_model_path.replace(".xml", ".bin") not in model_files) - except Exception: - return True - - -class OpenVINOCausalLM(nn.Module): - - def __init__( - self, - ov_core: ov.Core, - model_config: ModelConfig, - kv_cache_dtype: ov.Type, - ) -> None: - super().__init__() - self.logits_processor = LogitsProcessor( - model_config.hf_config.vocab_size, logits_as_input=True) - self.sampler = Sampler() - - export = _require_model_export(model_config.model) - if export: - logger.warning( - f"Provided model id {model_config.model} does not " # noqa: G004 - "contain OpenVINO IR, the model will be converted to IR with " - "default options. If you need to use specific options for " - "model conversion, use optimum-cli export openvino with " - "desired options.") - else: - logger.warning( - "OpenVINO IR is available for provided model id " # noqa: G004 - f"{model_config.model}. 
This IR will be used for inference " - "as-is, all possible options that may affect model conversion " - "are ignored.") - - load_in_8bit = (envs.VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS - if export else False) - pt_model = OVModelForCausalLM.from_pretrained( - model_config.model, - export=export, - compile=False, - load_in_8bit=load_in_8bit, - trust_remote_code=model_config.trust_remote_code, - ) - - ov_device = envs.VLLM_OPENVINO_DEVICE - paged_attention_transformation(pt_model.model) - _modify_cache_parameters(pt_model.model, kv_cache_dtype, - current_platform.is_openvino_cpu()) - - ov_compiled = ov_core.compile_model(pt_model.model, ov_device) - self.ov_request = ov_compiled.create_infer_request() - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: list[tuple[ov.Tensor, ov.Tensor]], - ) -> torch.Tensor: - flat_kv_caches = _flatten_inputs(kv_caches) - attn_metadata = get_forward_context().attn_metadata - - inputs = [ - input_ids, - positions, - *flat_kv_caches, - attn_metadata.past_lens, - attn_metadata.subsequence_begins, - attn_metadata.block_indices, - attn_metadata.block_indices_begins, - attn_metadata.max_context_len, - ] - - self.ov_request.start_async(inputs, share_inputs=True) - self.ov_request.wait() - - logits = torch.from_numpy(self.ov_request.get_tensor("logits").data) - - # TODO: remove 'view' once OpenVINO PA will drop 'seq_len' dimension - return logits.view(-1, logits.shape[-1]) - - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - hidden_states = _prune_hidden_states(hidden_states, sampling_metadata) - logits = self.logits_processor(None, hidden_states, sampling_metadata) - return logits - - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - - -def get_model( - vllm_config: VllmConfig, - kv_cache_dtype: 
ov.Type, - **kwargs, -) -> torch.nn.Module: - lora_config = kwargs.get("lora_config") - ov_core = kwargs.get("ov_core") - if lora_config: - raise ValueError( - "OpenVINO modeling does not support LoRA, " - "but LoRA is enabled. Support for this model may " - "be added in the future. If this is important to you, " - "please open an issue on github.") - - with set_current_vllm_config(vllm_config): - return OpenVINOCausalLM(ov_core, vllm_config.model_config, - kv_cache_dtype) diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 4912906fef465..0ed221043171d 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -2,7 +2,6 @@ import logging import traceback -from contextlib import suppress from itertools import chain from typing import TYPE_CHECKING, Optional @@ -191,21 +190,6 @@ def neuron_platform_plugin() -> Optional[str]: return "vllm.platforms.neuron.NeuronPlatform" if is_neuron else None -def openvino_platform_plugin() -> Optional[str]: - is_openvino = False - logger.debug("Checking if OpenVINO platform is available.") - with suppress(Exception): - is_openvino = vllm_version_matches_substr("openvino") - if is_openvino: - logger.debug("Confirmed OpenVINO platform is available" - " because vLLM is built with OpenVINO.") - if not is_openvino: - logger.debug("OpenVINO platform is not available because" - " vLLM is not built with OpenVINO.") - - return "vllm.platforms.openvino.OpenVinoPlatform" if is_openvino else None - - builtin_platform_plugins = { 'tpu': tpu_platform_plugin, 'cuda': cuda_platform_plugin, @@ -214,7 +198,6 @@ builtin_platform_plugins = { 'xpu': xpu_platform_plugin, 'cpu': cpu_platform_plugin, 'neuron': neuron_platform_plugin, - 'openvino': openvino_platform_plugin, } diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index d3bffaf4d69bf..9981deee39b72 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -33,7 +33,6 @@ class _Backend(enum.Enum): XFORMERS = 
enum.auto() ROCM_FLASH = enum.auto() TORCH_SDPA = enum.auto() - OPENVINO = enum.auto() FLASHINFER = enum.auto() TRITON_MLA = enum.auto() # Supported by V1 FLASHMLA = enum.auto() # Supported by V1 @@ -53,7 +52,6 @@ class PlatformEnum(enum.Enum): XPU = enum.auto() CPU = enum.auto() NEURON = enum.auto() - OPENVINO = enum.auto() OOT = enum.auto() UNSPECIFIED = enum.auto() @@ -136,9 +134,6 @@ class Platform: def is_neuron(self) -> bool: return self._enum == PlatformEnum.NEURON - def is_openvino(self) -> bool: - return self._enum == PlatformEnum.OPENVINO - def is_out_of_tree(self) -> bool: return self._enum == PlatformEnum.OOT diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py deleted file mode 100644 index f385064875ca7..0000000000000 --- a/vllm/platforms/openvino.py +++ /dev/null @@ -1,152 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from typing import TYPE_CHECKING, Optional - -import torch - -import vllm.envs as envs -from vllm.logger import init_logger - -from .interface import Platform, PlatformEnum, _Backend - -if TYPE_CHECKING: - from vllm.config import VllmConfig -else: - VllmConfig = None - -logger = init_logger(__name__) - -try: - import openvino as ov - import openvino.properties.hint as hints -except ImportError as e: - logger.warning("Failed to import OpenVINO with %r", e) - - -class OpenVinoPlatform(Platform): - _enum = PlatformEnum.OPENVINO - device_name: str = "openvino" - device_type: str = "openvino" - dispatch_key: str = "CPU" - - @classmethod - def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, - dtype: torch.dtype, kv_cache_dtype: Optional[str], - block_size: int, use_v1: bool, - use_mla: bool) -> str: - if selected_backend != _Backend.OPENVINO: - logger.info("Cannot use %s backend on OpenVINO.", selected_backend) - logger.info("Using OpenVINO Attention backend.") - return "vllm.attention.backends.openvino.OpenVINOAttentionBackend" - - @classmethod - def get_device_name(cls, device_id: int = 0) -> str: - 
return "openvino" - - @classmethod - def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: - return False - - @classmethod - def inference_mode(cls): - return torch.inference_mode(mode=True) - - @classmethod - def is_openvino_cpu(cls) -> bool: - return "CPU" in envs.VLLM_OPENVINO_DEVICE - - @classmethod - def is_openvino_gpu(cls) -> bool: - return "GPU" in envs.VLLM_OPENVINO_DEVICE - - @classmethod - def is_pin_memory_available(cls) -> bool: - logger.warning("Pin memory is not supported on OpenViNO.") - return False - - @classmethod - def check_and_update_config(cls, vllm_config: VllmConfig) -> None: - from vllm.utils import GiB_bytes - - parallel_config = vllm_config.parallel_config - assert (parallel_config.world_size == 1 - ), "OpenVINO only supports single CPU socket currently." - - if parallel_config.worker_cls == "auto": - parallel_config.worker_cls = \ - "vllm.worker.openvino_worker.OpenVINOWorker" - - # check and update model config - model_config = vllm_config.model_config - if model_config.dtype != torch.float32: - logger.warning( - f"Only float32 dtype is supported on OpenVINO, casting from {model_config.dtype}." 
# noqa: G004, E501 - ) - model_config.dtype = torch.float32 - if not model_config.enforce_eager: - logger.warning( - "CUDA graph is not supported on OpenVINO backend, fallback to " - "the eager mode.") - model_config.enforce_eager = True - - # check and update cache config - ov_core = ov.Core() - cache_config = vllm_config.cache_config - if cache_config and cache_config.block_size is None: - cache_config.block_size = 16 - - if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8": - if not OpenVinoPlatform.is_openvino_cpu(): - logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is " - "ignored for GPU, f16 data type will be used.") - cache_config.cache_dtype = ov.Type.f16 - else: - logger.info("KV cache type is overridden to u8 via " - "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var.") - cache_config.cache_dtype = ov.Type.u8 - else: - if OpenVinoPlatform.is_openvino_cpu(): - ov_device = envs.VLLM_OPENVINO_DEVICE - inference_precision = ov_core.get_property( - ov_device, hints.inference_precision) - if inference_precision == ov.Type.bf16: - cache_config.cache_dtype = ov.Type.bf16 - else: - cache_config.cache_dtype = ov.Type.f16 - else: - cache_config.cache_dtype = ov.Type.f16 - - if OpenVinoPlatform.is_openvino_cpu(): - if cache_config.block_size != 32: - logger.info( - f"OpenVINO CPU optimal block size is 32, overriding currently set {cache_config.block_size}" # noqa: G004, E501 - ) - cache_config.block_size = 32 - else: - if cache_config.block_size != 16: - logger.info( - f"OpenVINO GPU optimal block size is 16, overriding currently set {cache_config.block_size}" # noqa: G004, E501 - ) - cache_config.block_size = 16 - - kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE - if kv_cache_space >= 0: - if kv_cache_space == 0 and OpenVinoPlatform.is_openvino_cpu(): - cache_config.openvino_kvcache_space_bytes = 4 * GiB_bytes # type: ignore - logger.warning( - "Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) " - "for OpenVINO backend is not set, using 4 by default.") - else: 
- cache_config.openvino_kvcache_space_bytes = ( # type: ignore - kv_cache_space * GiB_bytes) - else: - raise RuntimeError( - "Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE" - f" {kv_cache_space}, expect a positive integer value.") - - assert vllm_config.device_config.device_type == "openvino" - assert vllm_config.lora_config is None, \ - "OpenVINO backend doesn't support LoRA" - assert cls.is_openvino_cpu() or \ - cls.is_openvino_gpu(), \ - "OpenVINO backend supports only CPU and GPU devices" diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py deleted file mode 100644 index aa1d2cbb2df29..0000000000000 --- a/vllm/worker/openvino_model_runner.py +++ /dev/null @@ -1,372 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from collections import defaultdict -from typing import Dict, List, NamedTuple, Optional, Tuple - -import openvino as ov -import torch -from torch import nn - -from vllm.attention import get_attn_backend -from vllm.attention.backends.openvino import OpenVINOAttentionMetadata -from vllm.config import VllmConfig -from vllm.forward_context import set_forward_context -from vllm.logger import init_logger -from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.model_loader.openvino import get_model -from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalKwargs, MultiModalPlaceholderMap) -from vllm.sequence import SequenceGroupMetadata -from vllm.worker.model_runner_base import ModelRunnerBase - -logger = init_logger(__name__) - - -class ModelInput(NamedTuple): - input_tokens: torch.Tensor - input_positions: torch.Tensor - attn_metadata: Optional[OpenVINOAttentionMetadata] - seq_lens: List[int] - query_lens: List[int] - multi_modal_kwargs: BatchedTensorInputs - - @classmethod - def empty(cls, device): - return ModelInput(input_tokens=torch.empty(0, device=device), - input_positions=torch.empty(0, 
device=device), - attn_metadata=None, - seq_lens=[], - query_lens=[], - multi_modal_kwargs={}) - - -class OpenVINOModelRunner(ModelRunnerBase): - - def __init__( - self, - ov_core: ov.Core, - vllm_config: VllmConfig, - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - *args, - **kwargs, - ): - self.ov_core = ov_core - ModelRunnerBase.__init__(self, vllm_config=vllm_config) - self.is_driver_worker = is_driver_worker - - self.device = self.device_config.device - - self.kv_cache_dtype = kv_cache_dtype - self.sliding_window = self.model_config.get_sliding_window() - self.block_size = self.cache_config.block_size - - self.attn_backend = get_attn_backend( - self.model_config.get_head_size(), - self.model_config.dtype, - self.kv_cache_dtype, - self.block_size, - self.model_config.is_attention_free, - ) - - # Multi-modal data support - self.mm_registry = MULTIMODAL_REGISTRY - self.multi_modal_input_mapper = self.mm_registry \ - .create_input_mapper(self.model_config) - - # Lazy initialization. - self.model: nn.Module # Set after init_Model - - def load_model(self) -> None: - self.model = get_model(vllm_config=self.vllm_config, - kv_cache_dtype=self.kv_cache_dtype, - ov_core=self.ov_core) - - def get_model(self) -> nn.Module: - return self.model - - def _prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> ModelInput: - """Prepare the model input based on a given sequence group. - - The API assumes seq_group_metadata_list is sorted by prefill -> decode. - - The result tensors and data structure also batches input in prefill - -> decode order. For example, - - - input_tokens[:num_prefill_tokens] contains prefill tokens. - - input_tokens[num_prefill_tokens:] contains decode tokens. 
- """ - input_tokens: List[int] = [] - input_positions: List[int] = [] - - seq_lens: List[int] = [] - past_lens: List[int] = [] - query_lens: List[int] = [] - multi_modal_kwargs_list: List[MultiModalKwargs] = [] - multi_modal_placeholder_maps: Dict[ - str, - MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) - - subsequence_begins: List[int] = [] - block_indices: List[int] = [] - block_indices_begins: List[int] = [] - - # initialize beginning of prefix sums - subsequence_begins.append(0) - block_indices_begins.append(0) - - if len(seq_group_metadata_list) == 0: - return ModelInput.empty(self.device) - - for seq_group_metadata in seq_group_metadata_list: - seq_ids = list(seq_group_metadata.seq_data.keys()) - is_prompt = seq_group_metadata.is_prompt - - for seq_id in seq_ids: - computed_block_nums = seq_group_metadata.computed_block_nums - if (self.scheduler_config is not None - and self.scheduler_config.chunked_prefill_enabled - and not (computed_block_nums is None - or computed_block_nums == [])): - raise RuntimeError( - "chunked prefill cannot be used with prefix caching " - "now.") - - seq_data = seq_group_metadata.seq_data[seq_id] - if is_prompt: - computed_len = seq_data.get_num_computed_tokens() - else: - # get_num_computed_tokens is incorrect for spec decoding. - # So, we should have a special logic here. - # TODO(sang): Fix it. - computed_len = seq_data.get_len() - 1 - - seq_len = min( - seq_data.get_len(), - computed_len + seq_group_metadata.token_chunk_size, - ) - if is_prompt: - tokens = seq_data.get_token_ids()[computed_len:seq_len] - else: - # Optimization. get_token_ids requires the entire copy of - # tokens. - tokens = [seq_data.get_last_token_id()] - - # Prefix cache was hit. 
- # Prefix is not supported with sliding_window - prefix_cache_hit = (computed_block_nums is not None - and len(computed_block_nums) > 0 - and self.sliding_window is None - and is_prompt) - - block_table = seq_group_metadata.block_tables[seq_id] - # TODO(sang): Combine chunked prefill and prefix caching by - # only allowing multiple of block_size chunk size. - # NOTE: This only works for oooooooxxx style attention. - if prefix_cache_hit: - assert computed_block_nums is not None - computed_len = len(computed_block_nums) * self.block_size - tokens = tokens[computed_len:] - elif (self.scheduler_config.chunked_prefill_enabled - or not is_prompt): - if seq_group_metadata.block_tables is not None: - # chunked prefill or decode - block_table = seq_group_metadata.block_tables[seq_id] - if self.sliding_window is not None: - # chunked prefill doesn't support sliding window. - assert not self.scheduler_config.chunked_prefill_enabled # noqa: E501 - sliding_window_blocks = (self.sliding_window // - self.block_size) - block_table = block_table[-sliding_window_blocks:] - else: - # Only happens when memory profiling runs. - block_table = [] - else: - # prompt phase w/o prefix_caching, chunked_prefill - pass - - block_indices.extend(block_table) - block_indices_begins.append(block_indices_begins[-1] + - len(block_table)) - - # TODO(sang): This is a hack to make sliding window work with - # paged attn. We can remove it if we make paged attn kernel - # to properly handle slinding window attn. 
- if self.sliding_window is not None and not is_prompt: - seq_len = min(seq_len, self.sliding_window) - computed_len = seq_len - 1 - - seq_lens.append(seq_len) - - query_len = seq_len - computed_len - query_lens.append(query_len) - - input_tokens.extend(tokens) - positions_range = range(computed_len, seq_len) - input_positions.extend(list(positions_range)) - - past_lens.append(computed_len) - subsequence_begins.append(subsequence_begins[-1] + query_len) - - if is_prompt: - assert len(seq_ids) == 1 - else: - assert ( - query_len == 1 - ), "seq_len: {}, computed_len: {}, query_len: {}".format( - seq_len, computed_len, query_len) - - if seq_group_metadata.multi_modal_data: - # NOTE: mm_data only includes the subset of multi-modal - # items that intersect with the current prefill positions. - mm_data, placeholder_maps = MultiModalPlaceholderMap \ - .from_seq_group(seq_group_metadata, positions_range) - - if self.mm_registry.has_processor(self.model_config): - mm_kwargs = mm_data - else: - mm_kwargs = self.multi_modal_input_mapper( - mm_data, - seq_group_metadata.mm_processor_kwargs, - ) - - multi_modal_kwargs_list.append(mm_kwargs) - - for modality, placeholder_map in placeholder_maps.items(): - multi_modal_placeholder_maps[modality].extend( - placeholder_map, ) - - max_query_len = max(query_lens) - assert max_query_len > 0, "query_lens: {}".format(query_lens) - - input_tokens = torch.tensor(input_tokens, - dtype=torch.long, - device=self.device) # type: ignore - input_positions = torch.tensor(input_positions, - dtype=torch.long, - device=self.device) # type: ignore - - past_lens_tensor = torch.tensor(past_lens, - dtype=torch.int32, - device=self.device) # type: ignore - subsequence_begins_tensor = torch.tensor( - subsequence_begins, dtype=torch.int32, - device=self.device) # type: ignore - block_indices_tensor = torch.tensor(block_indices, - dtype=torch.int32, - device=self.device) # type: ignore - block_indices_begins_tensor = torch.tensor( - block_indices_begins, 
dtype=torch.int32, - device=self.device) # type: ignore - - max_context_len = max(seq_lens) - max_context_len_tensor = torch.tensor( - max_context_len, dtype=torch.int32, - device=self.device) # type: ignore - - placeholder_index_maps = { - modality: placeholder_map.index_map() - for modality, placeholder_map in - multi_modal_placeholder_maps.items() - } - - attn_metadata = self.attn_backend.make_openvino_metadata( - past_lens=past_lens_tensor, - subsequence_begins=subsequence_begins_tensor, - block_indices=block_indices_tensor, - block_indices_begins=block_indices_begins_tensor, - max_context_len=max_context_len_tensor, - multi_modal_placeholder_index_maps=placeholder_index_maps, - enable_kv_scales_calculation=False, - ) - - multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) - - return ModelInput( - input_tokens, - input_positions, - attn_metadata, - seq_lens, - query_lens, - multi_modal_kwargs=multi_modal_kwargs, - ) - - def prepare_input_tensors( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, OpenVINOAttentionMetadata, - SamplingMetadata, BatchedTensorInputs]: - # Prepare input tensors. 
- ( - input_tokens, - input_positions, - attn_metadata, - seq_lens, - query_lens, - multi_modal_kwargs, - ) = self._prepare_model_input(seq_group_metadata_list) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens, - self.device, - pin_memory=False, - ) - - return ( - input_tokens, - input_positions, - attn_metadata, - sampling_metadata, - multi_modal_kwargs, - ) - - @torch.inference_mode() - def execute_model( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - kv_caches: List[Tuple["ov.Tensor", "ov.Tensor"]], - ) -> Optional[SamplerOutput]: - ( - input_tokens, - input_positions, - attn_metadata, - sampling_metadata, - multi_modal_kwargs, - ) = self.prepare_input_tensors(seq_group_metadata_list) - - model_executable = self.model - execute_model_kwargs = { - "input_ids": - input_tokens, - "positions": - input_positions, - "kv_caches": - kv_caches, - **MultiModalKwargs.as_kwargs(multi_modal_kwargs or {}, - device=self.device), - } - - with set_forward_context(attn_metadata, self.vllm_config, 0): - hidden_states = model_executable(**execute_model_kwargs) - - # Compute the logits. - logits = self.model.compute_logits(hidden_states, sampling_metadata) - - # Sample the next token. 
- output = self.model.sample( - logits=logits, - sampling_metadata=sampling_metadata, - ) - return output - - def prepare_model_input(self, *args, **kwargs): - raise NotImplementedError - - def make_model_input_from_broadcasted_tensor_dict(self, *args, **kwargs): - raise NotImplementedError diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py deleted file mode 100644 index fad91270ea2a4..0000000000000 --- a/vllm/worker/openvino_worker.py +++ /dev/null @@ -1,600 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -"""An OpenVINO worker class.""" -from typing import Any, Dict, List, Optional, Tuple - -import openvino as ov -import torch -import torch.distributed -import torch.nn as nn - -import vllm.envs as envs -from vllm.attention import get_attn_backend -from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, - ParallelConfig, VllmConfig) -from vllm.distributed import (broadcast_tensor_dict, - ensure_model_parallel_initialized, - init_distributed_environment) -from vllm.inputs import INPUT_REGISTRY -from vllm.logger import init_logger -from vllm.model_executor import set_random_seed -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.platforms import current_platform -from vllm.sampling_params import SamplingParams -from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata -from vllm.utils import bind_kv_cache -from vllm.worker.openvino_model_runner import OpenVINOModelRunner -from vllm.worker.worker_base import LoRANotSupportedWorkerBase, WorkerBase - -logger = init_logger(__name__) - - -class OpenVINOCacheEngine: - """Manages the KV cache for OpenVINO backend. - - This class is responsible for initializing and managing CPU KV - caches. It also provides methods for performing KV cache operations, such - as copying. 
- """ - - def __init__( - self, - cache_config: CacheConfig, - model_config: ModelConfig, - parallel_config: ParallelConfig, - device_config: DeviceConfig, - ov_core: ov.Core, - ov_device: str, - ) -> None: - assert device_config.device_type == "openvino" - self.cache_config = cache_config - self.model_config = model_config - self.parallel_config = parallel_config - - self.head_size = model_config.get_head_size() - if device_config.device.type == "cpu" and \ - cache_config.cache_dtype == ov.Type.u8: - # Scale, zero point and quantized data will be stored together. - # The layout for per token per head: - # |scale(f32)|zeropoint(f32)|quantized data(u8,idx_1)|quantized data(u8,idx_2)|...|quantized data(u8,idx_head_size)| # noqa: E501 - # so, we have to extend head_size by 8, which is sizeof(float) - # for scale and sizeof(float) for zeropoint - self.head_size += 8 - self.num_layers = model_config.get_num_layers(parallel_config) - self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) - - self.block_size = cache_config.block_size - # Note: In CacheConfig, num_gpu_blocks actual is num_cpu_blocks - # for OpenVINO backend with a CPU target device, because we want - # to reuse KV cache management in the scheduler. - self.num_device_blocks = cache_config.num_gpu_blocks - self.num_swap_blocks = cache_config.num_cpu_blocks - - # Get attention backend. - self.attn_backend = get_attn_backend( - self.head_size, - self.model_config.dtype, - self.cache_config.cache_dtype, - self.block_size, - self.model_config.is_attention_free, - ) - - # Initialize the cache. - self.kv_cache: List[Tuple[ov.Tensor, - ov.Tensor]] = self._allocate_kv_cache( - self.num_device_blocks, ov_core, - ov_device) - - # Initialize the swap. 
- self.swap_cache: List[Tuple[ov.Tensor, - ov.Tensor]] = self._allocate_swap_cache( - self.num_swap_blocks, ov_device) - - def _allocate_kv_cache( - self, - num_blocks: int, - ov_core: ov.Core, - ov_device: str, - ) -> List[Tuple[ov.Tensor, ov.Tensor]]: - """Allocates KV cache.""" - k_block_shape = v_block_shape = self.attn_backend.get_kv_cache_shape( - num_blocks, self.block_size, self.num_kv_heads, self.head_size)[1:] - kv_cache: List[Tuple[ov.Tensor, ov.Tensor]] = [] - - if current_platform.is_openvino_cpu(): - for _ in range(self.num_layers): - key_blocks = ov.Tensor(self.cache_config.cache_dtype, - k_block_shape) - value_blocks = ov.Tensor(self.cache_config.cache_dtype, - v_block_shape) - kv_cache.append((key_blocks, value_blocks)) - else: - # Update key_cache shape: - k_block_shape = (v_block_shape[0], v_block_shape[1], - v_block_shape[3], v_block_shape[2]) - - remote_context = ov_core.get_default_context(ov_device) - - for _ in range(self.num_layers): - key_blocks = \ - remote_context.create_tensor(self.cache_config.cache_dtype, - ov.Shape(k_block_shape), - {}) - - value_blocks = \ - remote_context.create_tensor(self.cache_config.cache_dtype, - ov.Shape(v_block_shape), - {}) - - kv_cache.append((key_blocks, value_blocks)) - - return kv_cache - - def _allocate_swap_cache( - self, - num_blocks: int, - ov_device: str, - ) -> List[Tuple[ov.Tensor, ov.Tensor]]: - """Allocates swap cache.""" - k_block_shape = v_block_shape = self.attn_backend.get_kv_cache_shape( - num_blocks, self.block_size, self.num_kv_heads, self.head_size)[1:] - swap_cache: List[Tuple[ov.Tensor, ov.Tensor]] = [] - - if num_blocks == 0: - return swap_cache - - assert not current_platform.is_openvino_cpu(), \ - "CPU device isn't supposed to have swap cache" - - # Update key_cache shape: - k_block_shape = (v_block_shape[0], v_block_shape[1], v_block_shape[3], - v_block_shape[2]) - - for _ in range(self.num_layers): - key_blocks = ov.Tensor(self.cache_config.cache_dtype, - k_block_shape) - 
value_blocks = ov.Tensor(self.cache_config.cache_dtype, - v_block_shape) - swap_cache.append((key_blocks, value_blocks)) - - return swap_cache - - def swap_in(self, src_to_dst: List[Tuple[int, int]]) -> None: - for i in range(self.num_layers): - for swap_tensor, kv_tensor in zip(self.swap_cache[i], - self.kv_cache[i]): - self.attn_backend.swap_blocks(swap_tensor, kv_tensor, - src_to_dst) - - def swap_out(self, src_to_dst: List[Tuple[int, int]]) -> None: - for i in range(self.num_layers): - for swap_tensor, kv_tensor in zip(self.swap_cache[i], - self.kv_cache[i]): - self.attn_backend.swap_blocks(kv_tensor, swap_tensor, - src_to_dst) - - def copy(self, src_to_dsts: List[Tuple[int, int]]) -> None: - if (len(src_to_dsts) > 0): - self.attn_backend.copy_blocks(self.kv_cache, src_to_dsts) - - @staticmethod - def get_cache_block_size( - block_size: int, - cache_dtype: ov.Type, - model_config: ModelConfig, - parallel_config: ParallelConfig, - ) -> int: - head_size = model_config.get_head_size() - num_kv_heads = model_config.get_num_kv_heads(parallel_config) - num_layers = model_config.get_num_layers(parallel_config) - - if cache_dtype == ov.Type.u8: - # Scale, zero point and quantized data will be stored together. - # The layout for per token per head: - # |scale(f32)|zeropoint(f32)|quantized data(u8,idx_1)|quantized data(u8,idx_2)|...|quantized data(u8,idx_head_size)| # noqa: E501 - # so, we have to extend head_size by 8, which is sizeof(float) - # for scale and sizeof(float) for zeropoint - head_size += 8 - - key_cache_block = block_size * num_kv_heads * head_size - value_cache_block = key_cache_block - total = num_layers * (key_cache_block + value_cache_block) - dtype_size = cache_dtype.size - return dtype_size * total - - -class OpenVINOWorker(LoRANotSupportedWorkerBase): - """A worker class that executes the model on OpenVINO backend. - - Each worker is associated with a single OpenVINO device. 
The worker is - responsible for maintaining the KV cache and executing the model on the - OpenVINO backend. - """ - - def __init__( - self, - vllm_config: VllmConfig, - local_rank: int, - rank: int, - distributed_init_method: str, - is_driver_worker: bool = False, - ) -> None: - WorkerBase.__init__(self, vllm_config) - self.ov_core = ov.Core() - self.parallel_config.rank = rank - self.local_rank = local_rank - self.rank = rank - self.distributed_init_method = distributed_init_method - self.is_driver_worker = is_driver_worker - if self.is_driver_worker: - assert self.rank == 0, "The driver worker must have rank 0." - - if self.model_config.trust_remote_code: - # note: lazy import to avoid importing torch before initializing - from vllm.utils import init_cached_hf_modules - - init_cached_hf_modules() - self.model_runner = OpenVINOModelRunner( - self.ov_core, - vllm_config=self.vllm_config, - kv_cache_dtype=self.vllm_config.cache_config.cache_dtype, - is_driver_worker=is_driver_worker, - ) - # Uninitialized cache engine. Will be initialized by - # initialize_cache. - self.cache_engine: OpenVINOCacheEngine - self.kv_cache: List[Tuple[ov.Tensor, ov.Tensor]] - - def init_device(self) -> None: - self.init_distributed_environment() - # Set random seed. - set_random_seed(self.model_config.seed) - - def load_model(self): - self.model_runner.load_model() - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Determine the number of blocks available for the KV cache. - - This determines how many KV blocks can fit into the configured - KV cache space. - """ - # For OpenVINO backend, in case of CPU device, the block number will be - # calculated based on the openvino_kvcache_space_bytes. 
- cache_block_size = self.get_cache_block_size_bytes() - kvcache_space_bytes = self.cache_config.openvino_kvcache_space_bytes - - if current_platform.is_openvino_cpu(): - num_device_blocks = int(kvcache_space_bytes // cache_block_size) - num_swap_blocks = 0 - else: - if kvcache_space_bytes > 0: - logger.info("KV_CACHE size was explicitly configured via " - "VLLM_OPENVINO_KVCACHE_SPACE environment " - "variable, ignoring profiling run.") - kv_cache_size = kvcache_space_bytes - else: - try: - kv_cache_size = self.profile_run() - except Exception as err: - raise RuntimeError( - "The error occurred during profile run. This might be " - "due to insufficient GPU memory. Consider decreasing " - "`max_model_len` to limit the maximum simultaneously " - "processed tokens.") from err - - num_device_blocks = int(kv_cache_size // cache_block_size) - num_swap_blocks = int(self.cache_config.swap_space_bytes // - cache_block_size) - - return num_device_blocks, num_swap_blocks - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - """Initialize the KV cache. Swappable CPU memory is only - supported on GPU. - - For CPU, we use the num_gpu_blocks to - determine how many non-swappable CPU blocks to allocate. - """ - - num_device_blocks = num_gpu_blocks - num_swap_blocks = num_cpu_blocks - - if current_platform.is_openvino_cpu(): - assert (num_swap_blocks == 0 - ), f"{type(self)} does not support swappable cache for CPU" - - self._validate_num_blocks(num_device_blocks) - self.cache_config.num_gpu_blocks = num_device_blocks - self.cache_config.num_cpu_blocks = num_swap_blocks - - # Initialize the cache. - self._init_cache_engine() - - def _validate_num_blocks(self, num_blocks: int) -> None: - """Raise errors if the num_blocks is invalid.""" - if num_blocks <= 0: - raise ValueError( - "No available memory for the cache blocks. 
" - "Try increasing `VLLM_OPENVINO_KVCACHE_SPACE` when " - "initializing the engine.") - - max_seq_len = self.cache_config.block_size * num_blocks - if self.model_config.max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({self.model_config.max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). Try increasing " - "`VLLM_OPENVINO_KVCACHE_SPACE` or decreasing `max_model_len` " - "when initializing the engine.") - - def _init_cache_engine(self) -> None: - ov_device = envs.VLLM_OPENVINO_DEVICE - self.cache_engine = OpenVINOCacheEngine( - self.cache_config, - self.model_config, - self.parallel_config, - self.device_config, - self.ov_core, - ov_device, - ) - self.kv_cache = self.cache_engine.kv_cache - bind_kv_cache(self.compilation_config.static_forward_context, - [self.kv_cache]) - self.model_runner.block_size = self.cache_engine.block_size - - assert self.kv_cache is not None - - # Populate the cache to warmup the memory - if current_platform.is_openvino_cpu(): - for key_cache, value_cache in self.kv_cache: - key_cache.data[:] = 0 - value_cache.data[:] = 0 - - def cache_swap_in(self, src_to_dst: List[Tuple[int, int]]) -> None: - self.cache_engine.swap_in(src_to_dst) - - def cache_swap_out(self, src_to_dst: List[Tuple[int, int]]) -> None: - self.cache_engine.swap_out(src_to_dst) - - def cache_copy( - self, - blocks_to_copy: List[Tuple[int, int]], - ) -> None: - self.cache_engine.copy(blocks_to_copy) # type: ignore - - def get_model(self) -> nn.Module: - return self.model_runner.get_model() - - @torch.inference_mode() - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> List[SamplerOutput]: - if execute_model_req is None: - seq_group_metadata_list = None - else: - seq_group_metadata_list = execute_model_req.seq_group_metadata_list - - if self.is_driver_worker: - assert seq_group_metadata_list is not None - num_seq_groups: int = 
len(seq_group_metadata_list) - assert execute_model_req is not None - blocks_to_copy = execute_model_req.blocks_to_copy - blocks_to_swap_in = execute_model_req.blocks_to_swap_in - blocks_to_swap_out = execute_model_req.blocks_to_swap_out - data: Dict[str, Any] = { - "num_seq_groups": num_seq_groups, - "blocks_to_copy": execute_model_req.blocks_to_copy, - "blocks_to_swap_in": execute_model_req.blocks_to_swap_in, - "blocks_to_swap_out": execute_model_req.blocks_to_swap_out, - } - broadcast_tensor_dict(data, src=0) - else: - data = broadcast_tensor_dict(src=0) - num_seq_groups = data["num_seq_groups"] - blocks_to_copy = data["blocks_to_copy"] - blocks_to_swap_in = data["blocks_to_swap_in"] - blocks_to_swap_out = data["blocks_to_swap_out"] - - if current_platform.is_openvino_cpu(): - assert len(execute_model_req.blocks_to_swap_in) == 0 - assert len(execute_model_req.blocks_to_swap_out) == 0 - else: - self.cache_swap_in(blocks_to_swap_in) - self.cache_swap_out(blocks_to_swap_out) - - self.cache_copy(blocks_to_copy) - - # If there is no input, we don't need to execute the model. - if num_seq_groups == 0: - return [] - - output = self.model_runner.execute_model(seq_group_metadata_list, - self.kv_cache) - - # OpenVINO worker only supports single-step execution. - return [output] - - def init_distributed_environment(self) -> None: - """Initialize the distributed environment.""" - - parallel_config = self.parallel_config - rank = self.rank - distributed_init_method = self.distributed_init_method - init_distributed_environment( - world_size=parallel_config.world_size, - rank=rank, - distributed_init_method=distributed_init_method, - backend="gloo", - ) - - # A small all_reduce for warmup. 
- torch.distributed.all_reduce(torch.zeros(1).cpu()) - - ensure_model_parallel_initialized( - parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size, - ) - - def get_cache_block_size_bytes(self) -> int: - """Return the size in bytes of a single KV cache block.""" - return OpenVINOCacheEngine.get_cache_block_size( - self.cache_config.block_size, - self.cache_config.cache_dtype, - self.model_config, - self.parallel_config, - ) - - def profile_run(self) -> int: - ov_device = envs.VLLM_OPENVINO_DEVICE - - assert not current_platform.is_openvino_cpu(), \ - "CPU device isn't supposed to use profile run." - - import openvino.properties.device as device - import openvino.properties.intel_gpu as intel_gpu - - ov_core = self.ov_core - cache_config = self.cache_config - model_config = self.model_config - parallel_config = self.parallel_config - device_config = self.device_config - input_registry = INPUT_REGISTRY - mm_registry = MULTIMODAL_REGISTRY - mm_registry.init_mm_limits_per_prompt(model_config) - - # Execute a forward pass with dummy inputs to profile the memory usage - # of the model. - def model_profile_run(): - top_k = model_config.get_vocab_size() - 1 - sampling_params = SamplingParams(top_p=0.99, top_k=top_k) - - max_num_batched_tokens = \ - self.scheduler_config.max_num_batched_tokens - max_num_seqs = self.scheduler_config.max_num_seqs - tmp_cache_config = CacheConfig(cache_config.block_size, - cache_config.gpu_memory_utilization, - cache_config.swap_space_bytes, - "auto") - tmp_cache_config.num_gpu_blocks = 1 - tmp_cache_config.num_cpu_blocks = 0 - tmp_cache_config.cache_dtype = cache_config.cache_dtype - - profiling_cache_engine = OpenVINOCacheEngine( - tmp_cache_config, model_config, parallel_config, device_config, - ov_core, ov_device) - - # Profile memory usage with max_num_sequences sequences and the - # total # number of tokens equal to max_num_batched_tokens. 
- seqs: List[SequenceGroupMetadata] = [] - for group_id in range(max_num_seqs): - seq_len = (max_num_batched_tokens // max_num_seqs + - (group_id < max_num_batched_tokens % max_num_seqs)) - block_size = cache_config.block_size - seq_num_blocks = (seq_len + block_size - 1) // block_size - - dummy_data = input_registry \ - .dummy_data_for_profiling(model_config, - seq_len, - mm_registry) - - block_tables = [[0] * seq_num_blocks] * max_num_seqs - seq = SequenceGroupMetadata( - request_id=str(group_id), - is_prompt=True, - seq_data={group_id: dummy_data.seq_data}, - sampling_params=sampling_params, - block_tables=block_tables, - lora_request=None, - multi_modal_data=dummy_data.multi_modal_data) - seqs.append(seq) - - self.model_runner.block_size = tmp_cache_config.block_size - - bind_kv_cache(self.compilation_config.static_forward_context, - profiling_cache_engine.kv_cache) - # Run the model with the dummy inputs. - self.model_runner.execute_model(seqs, - profiling_cache_engine.kv_cache) - - # Explicitly revert bind_kv_cache and delete temporary KV cache - # manager to free KV cache when real inputs will be passed to OV - bind_kv_cache(self.compilation_config.static_forward_context, [[ - torch.tensor([]) - for _ in range(len(profiling_cache_engine.kv_cache)) - ]]) - del profiling_cache_engine - - logger.info( - "Start profiling run with dummy inputs to evaluate " - "memory usage for %s. It might take a while.", ov_device) - - model_profile_run() - - gpu_device_type = ov_core.get_property(ov_device, device.type) - memory_statistics = \ - ov_core.get_property(ov_device, intel_gpu.memory_statistics) - memory_utilization = cache_config.gpu_memory_utilization - - if gpu_device_type == device.Type.INTEGRATED and \ - memory_utilization >= 0.9: - logger.warning( - "iGPU is used with high gpu_memory_utilization=%f " - "value. This may cause low performance due to " - "occupying the majority of available system " - "memory. 
Please consider decreasing " - "gpu_memory_utilization or explicitly setting " - "`VLLM_OPENVINO_KVCACHE_SPACE` (GB) environment " - "variable.", memory_utilization) - - # sum up all used device memory - device_memory_types = ["cl_mem", "usm_device"] - used_device_mem = \ - sum(memory_statistics.get(key, 0) for key in device_memory_types) - - if gpu_device_type == device.Type.INTEGRATED: - used_device_mem += memory_statistics.get("usm_host", 0) - - # there could be unaccounted extra memory reserved by kernels, kept - # in memory pools, etc - # therefore, add a threshold to account for this - used_memory_threshold = 1.1 - used_device_mem *= used_memory_threshold - - total_device_memory = \ - ov_core.get_property(ov_device, intel_gpu.device_total_mem_size) - - def format_memory_size(size) -> str: - units = ["B", "KB", "MB", "GB"] - unit_index = 0 - - while size > 1024 and unit_index < len(units) - 1: - size /= 1024 - unit_index += 1 - - return f"{size:.2f} {units[unit_index]}" - - total_device_memory_str = \ - format(format_memory_size(total_device_memory)) - used_device_memory_str = \ - format(format_memory_size(used_device_mem)) - - logger.info( - "Total %s memory: %s. " - "Amount of memory required to run the model with " - "max_num_batched_tokens=%d: %s.", ov_device, - total_device_memory_str, - self.scheduler_config.max_num_batched_tokens, - used_device_memory_str) - - if used_device_mem >= total_device_memory: - raise RuntimeError( - f"The required memory size {used_device_memory_str} for model " - "is higher than the total available device " - "memory {total_device_memory_str}. Please consider to " - "decrease `max_num_batched_tokens` or increase " - "`gpu_memory_utilization`") - - return total_device_memory * memory_utilization - used_device_mem