From 38db529f66712502a3cf93488229fc9fd2dc76fc Mon Sep 17 00:00:00 2001 From: Aziz Date: Thu, 18 Sep 2025 21:18:56 +0200 Subject: [PATCH 1/6] [feat]: Create interface for model-specific M-RoPE (#24194) Signed-off-by: AzizCode92 Signed-off-by: Aziz Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cyrus Leung --- vllm/model_executor/models/__init__.py | 11 ++- vllm/model_executor/models/interfaces.py | 68 +++++++++++++ vllm/model_executor/models/qwen2_vl.py | 118 ++++++++++++++++++++++- vllm/v1/worker/gpu_model_runner.py | 33 +++++-- vllm/worker/model_runner.py | 42 +++++--- 5 files changed, 242 insertions(+), 30 deletions(-) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index d3ee6872dd8bf..4ccba64f2c110 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -1,10 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal, - SupportsPP, SupportsTranscription, SupportsV0Only, - has_inner_state, supports_lora, supports_multimodal, - supports_pp, supports_transcription, supports_v0_only) +from .interfaces import (HasInnerState, SupportsLoRA, SupportsMRoPE, + SupportsMultiModal, SupportsPP, SupportsTranscription, + SupportsV0Only, has_inner_state, supports_lora, + supports_mrope, supports_multimodal, supports_pp, + supports_transcription, supports_v0_only) from .interfaces_base import (VllmModelForPooling, VllmModelForTextGeneration, is_pooling_model, is_text_generation_model) from .registry import ModelRegistry @@ -21,6 +22,8 @@ __all__ = [ "supports_lora", "SupportsMultiModal", "supports_multimodal", + "SupportsMRoPE", + "supports_mrope", "SupportsPP", "supports_pp", "SupportsTranscription", diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 8f8e300c84d71..e9c600e36cfa7 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -8,6 +8,7 @@ from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol, import numpy as np import torch from torch import Tensor +from transformers import PretrainedConfig from transformers.models.whisper.tokenization_whisper import LANGUAGES from typing_extensions import Self, TypeIs @@ -852,3 +853,70 @@ def supports_eagle3( model: Union[type[object], object], ) -> Union[TypeIs[type[SupportsEagle3]], TypeIs[SupportsEagle3]]: return isinstance(model, SupportsEagle3) + + +@runtime_checkable +class SupportsMRoPE(Protocol): + """The interface required for all models that support M-RoPE.""" + + supports_mrope: ClassVar[Literal[True]] = True + """ + A flag that indicates this model supports M-RoPE. + + Note: + There is no need to redefine this flag if this class is in the + MRO of your model class. + """ + + def get_mrope_input_positions( + self, + input_tokens: list[int], + hf_config: PretrainedConfig, + image_grid_thw: Optional[Union[list[list[int]], torch.Tensor]], + video_grid_thw: Optional[Union[list[list[int]], torch.Tensor]], + second_per_grid_ts: Optional[list[float]] = None, + context_len: int = 0, + seq_len: Optional[int] = None, + audio_feature_lengths: Optional[torch.Tensor] = None, + use_audio_in_video: bool = False, + ) -> tuple[torch.Tensor, int]: + """ + Get M-RoPE input positions and delta value for this specific model. 
+ + This method should be implemented by each model that supports M-RoPE + to provide model-specific logic for computing input positions. + + Args: + input_tokens: List of input token IDs + hf_config: HuggingFace model configuration + image_grid_thw: Image grid dimensions (t, h, w) + video_grid_thw: Video grid dimensions (t, h, w) + second_per_grid_ts: Seconds per grid timestep for videos + context_len: Context length + seq_len: Sequence length + audio_feature_lengths: Audio feature lengths for multimodal models + use_audio_in_video: Whether to use audio in video for interleaving + + Returns: + Tuple of (llm_positions, mrope_position_delta) + - llm_positions: Tensor of shape [3, num_tokens] + with T/H/W positions + - mrope_position_delta: Delta for position calculations + """ + ... + + +@overload +def supports_mrope(model: type[object]) -> TypeIs[type[SupportsMRoPE]]: + ... + + +@overload +def supports_mrope(model: object) -> TypeIs[SupportsMRoPE]: + ... + + +def supports_mrope( + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsMRoPE]], TypeIs[SupportsMRoPE]]: + return isinstance(model, SupportsMRoPE) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index b6576b783b64a..7f361678ba72e 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -32,7 +32,7 @@ import torch import torch.nn as nn import torch.nn.functional as F from einops import rearrange, repeat -from transformers import AutoConfig, BatchFeature +from transformers import AutoConfig, BatchFeature, PretrainedConfig from transformers.models.qwen2_vl import (Qwen2VLImageProcessor, Qwen2VLProcessor) from transformers.models.qwen2_vl.configuration_qwen2_vl import ( @@ -73,7 +73,7 @@ from vllm.transformers_utils.config import uses_mrope from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils.tensor_schema import TensorSchema, TensorShape -from .interfaces import (MultiModalEmbeddings, SupportsLoRA, +from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMRoPE, SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, WeightsMapper, init_vllm_registered_model, maybe_prefix, @@ -1096,7 +1096,7 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo] info=Qwen2VLProcessingInfo, dummy_inputs=Qwen2VLDummyInputsBuilder) class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsLoRA, SupportsPP): + SupportsLoRA, SupportsPP, SupportsMRoPE): # To ensure correct weight loading and mapping. 
hf_to_vllm_mapper = WeightsMapper( @@ -1109,6 +1109,118 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, "model.": "language_model.model.", }) + def get_mrope_input_positions( + self, + input_tokens: list[int], + hf_config: PretrainedConfig, + image_grid_thw: Optional[Union[list[list[int]], torch.Tensor]], + video_grid_thw: Optional[Union[list[list[int]], torch.Tensor]], + second_per_grid_ts: Optional[list[float]] = None, + context_len: int = 0, + seq_len: Optional[int] = None, + audio_feature_lengths: Optional[torch.Tensor] = None, + use_audio_in_video: bool = False, + ) -> tuple[torch.Tensor, int]: + """Get M-RoPE input positions for Qwen2-VL model.""" + if image_grid_thw is None: + image_grid_thw = [] + if video_grid_thw is None: + video_grid_thw = [] + if second_per_grid_ts is None: + second_per_grid_ts = [] + + image_token_id = hf_config.image_token_id + video_token_id = hf_config.video_token_id + vision_start_token_id = hf_config.vision_start_token_id + spatial_merge_size = hf_config.vision_config.spatial_merge_size + tokens_per_second = getattr(hf_config.vision_config, + "tokens_per_second", 1.0) + + input_tokens_tensor = torch.tensor(input_tokens) + vision_start_indices = torch.argwhere( + input_tokens_tensor == vision_start_token_id).squeeze(1) + vision_tokens = input_tokens_tensor[vision_start_indices + 1] + image_nums = (vision_tokens == image_token_id).sum() + video_nums = (vision_tokens == video_token_id).sum() + llm_pos_ids_list: list = [] + + st = 0 + remain_images, remain_videos = image_nums, video_nums + + image_index, video_index = 0, 0 + for _ in range(image_nums + video_nums): + video_second_per_grid_t = 0.0 + if remain_images > 0: + try: + ed_image = input_tokens.index(image_token_id, st) + except ValueError: + ed_image = len(input_tokens) + 1 + else: + ed_image = len(input_tokens) + 1 + if remain_videos > 0: + try: + ed_video = input_tokens.index(video_token_id, st) + except ValueError: + ed_video = len(input_tokens) + 1 + else: + ed_video = len(input_tokens) + 1 + if ed_image < ed_video: + t, h, w = ( + image_grid_thw[image_index][0], + image_grid_thw[image_index][1], + image_grid_thw[image_index][2], + ) + image_index += 1 + remain_images -= 1 + ed = ed_image + else: + t, h, w = ( + video_grid_thw[video_index][0], + video_grid_thw[video_index][1], + video_grid_thw[video_index][2], + ) + video_second_per_grid_t = 1.0 + if second_per_grid_ts: + video_second_per_grid_t = second_per_grid_ts[video_index] + video_index += 1 + remain_videos -= 1 + ed = ed_video + + llm_grid_t, llm_grid_h, llm_grid_w = \ + t, h // spatial_merge_size, w // spatial_merge_size + text_len = ed - st + + st_idx = llm_pos_ids_list[-1].max() + 1 if len( + llm_pos_ids_list) > 0 else 0 + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + t_index = (torch.arange(llm_grid_t).view(-1, 1).expand( + -1, llm_grid_h * llm_grid_w) * video_second_per_grid_t * + tokens_per_second).long().flatten() + + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand( + llm_grid_t, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand( + llm_grid_t, llm_grid_h, -1).flatten() + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + text_len + st_idx) + st = ed + llm_grid_t * llm_grid_h * llm_grid_w + + if st < len(input_tokens): + st_idx = llm_pos_ids_list[-1].max() + 1 if len( + llm_pos_ids_list) > 0 else 0 + text_len = len(input_tokens) - st + llm_pos_ids_list.append( + torch.arange(text_len).view(1, 
-1).expand(3, -1) + st_idx) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + mrope_position_delta = (llm_positions.max() + 1 - + len(input_tokens)).item() + llm_positions = llm_positions[:, context_len:seq_len] + + return llm_positions, mrope_position_delta + @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: if modality.startswith("image"): diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4873b586724ec..053e8f0537ed9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -42,6 +42,7 @@ from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader from vllm.model_executor.models.interfaces import (is_mixture_of_experts, supports_eagle3, + supports_mrope, supports_transcription) from vllm.model_executor.models.interfaces_base import ( VllmModelForPooling, is_pooling_model, is_text_generation_model) @@ -730,16 +731,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): if mm_input.get("use_audio_in_video") is True: use_audio_in_video = True - req_state.mrope_positions, req_state.mrope_position_delta = \ - MRotaryEmbedding.get_input_positions_tensor( - req_state.prompt_token_ids, - hf_config=self.model_config.hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - audio_feature_lengths=audio_feature_lengths, - use_audio_in_video=use_audio_in_video, - ) + if supports_mrope(self.model): + req_state.mrope_positions, req_state.mrope_position_delta = \ + self.model.get_mrope_input_positions( + req_state.prompt_token_ids, + hf_config=self.model_config.hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + audio_feature_lengths=audio_feature_lengths, + use_audio_in_video=use_audio_in_video, + ) + else: + req_state.mrope_positions, req_state.mrope_position_delta = \ + MRotaryEmbedding.get_input_positions_tensor( + req_state.prompt_token_ids, + hf_config=self.model_config.hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + audio_feature_lengths=audio_feature_lengths, + use_audio_in_video=use_audio_in_video, + ) def _extract_mm_kwargs( self, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 88f83c9dd7e6c..594382650e3c1 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -41,7 +41,8 @@ from vllm.model_executor.layers.sampler import (Sampler, SamplerOutput, get_sampler) from vllm.model_executor.model_loader import get_model from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -from vllm.model_executor.models import supports_lora, supports_multimodal +from vllm.model_executor.models import (supports_lora, supports_mrope, + supports_multimodal) from vllm.model_executor.models.utils import set_cpu_offload_max_bytes from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, MultiModalKwargs, MultiModalPlaceholderMap, @@ -670,18 +671,33 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): inter_data.seq_ids[seq_idx]] token_ids = seq_data.get_token_ids() - mrope_input_positions, mrope_position_delta = \ - MRotaryEmbedding.get_input_positions( - token_ids, - hf_config=hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - 
second_per_grid_ts=second_per_grid_ts, - context_len=inter_data.context_lens[seq_idx], - seq_len=inter_data.seq_lens[seq_idx], - audio_feature_lengths=audio_feature_lengths, - use_audio_in_video=use_audio_in_video, - ) + if supports_mrope(self.runner.model): + mrope_input_positions, mrope_position_delta = \ + self.runner.model.get_mrope_input_positions( + token_ids, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + context_len=inter_data.context_lens[seq_idx], + seq_len=inter_data.seq_lens[seq_idx], + audio_feature_lengths=audio_feature_lengths, + use_audio_in_video=use_audio_in_video, + ) + mrope_input_positions = mrope_input_positions.tolist() + else: + mrope_input_positions, mrope_position_delta = \ + MRotaryEmbedding.get_input_positions( + token_ids, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + context_len=inter_data.context_lens[seq_idx], + seq_len=inter_data.seq_lens[seq_idx], + audio_feature_lengths=audio_feature_lengths, + use_audio_in_video=use_audio_in_video, + ) seq_data.mrope_position_delta = mrope_position_delta inter_data.mrope_input_positions[ From 75fb112d80f680624dc99a00e02be6a45661f948 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 18 Sep 2025 15:32:24 -0400 Subject: [PATCH 2/6] [Bug] Fix `returned_lse` not Defined issue (#25106) Signed-off-by: yewentao256 Co-authored-by: Tyler Michael Smith --- vllm/v1/attention/backends/mla/cutlass_mla.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py index 21be17a750df4..ae534f3207b51 100644 --- a/vllm/v1/attention/backends/mla/cutlass_mla.py +++ b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -206,12 +206,11 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): ) if H < MAX_HEADS: - # Extract the subsets of the outputs - returned_lse = lse[:, :H].contiguous( - ) if self.need_to_return_lse_for_decode else lse out = out[:, :H] + if self.need_to_return_lse_for_decode: + lse = lse[:, :H].contiguous() - return out, returned_lse + return out, lse def _forward_decode( self, From d2a30a2d933226d3951ad98cb5de0c74e2e64826 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 18 Sep 2025 15:38:37 -0400 Subject: [PATCH 3/6] [Bug] Fix torch Compilation Cache Hit Error (#25093) Signed-off-by: yewentao256 --- vllm/config/compilation.py | 12 ------------ vllm/platforms/cuda.py | 17 ++++++++++------- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index f8ccc20222615..3618f472e742d 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -563,18 +563,6 @@ class CompilationConfig: self.cudagraph_mode = CUDAGraphMode.FULL self.splitting_ops = [] - if envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput": - # exclude MoE dispatch/combine from capture by ensuring - # piecewise splitting includes them, so communication remains - # outside CUDA graphs while compute can still be graphed. 
- moe_ops = [ - "vllm.moe_forward", - "vllm.moe_forward_shared", - ] - for op in moe_ops: - if op not in self.splitting_ops: - self.splitting_ops.append(op) - def splitting_ops_contain_attention(self) -> bool: return self.splitting_ops is not None and all( op in self.splitting_ops for op in self._attention_ops) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 8e3436a9e73c5..87d8f2b7481bb 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -191,14 +191,17 @@ class CudaPlatformBase(Platform): compilation_config = vllm_config.compilation_config if (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput" and parallel_config.data_parallel_size > 1 - and compilation_config.cudagraph_mode - not in [CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE]): + and compilation_config.cudagraph_mode != CUDAGraphMode.NONE): + # TODO: Piecewise Cuda graph might be enabled + # if torch compile cache key issue fixed + # See https://github.com/vllm-project/vllm/pull/25093 logger.info( - "Data Parallel with DeepEP high-throughput: using PIECEWISE " - "CUDA graphs and excluding MoE ops from capture. Set " - "VLLM_ALL2ALL_BACKEND=deepep_low_latency if you need MoE " - "graphs captured as well.") - compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE + "Data Parallel: disabling cudagraphs since DP " + "with DeepEP high-throughput kernels are not CUDA Graph " + "compatible. The DeepEP low-latency kernels are CUDA Graph " + "compatible. Set the all_to_all backend to deepep_low_latency " + "to use those kernels instead.") + compilation_config.cudagraph_mode = CUDAGraphMode.NONE @classmethod def get_current_memory_usage(cls, From 1c3dad22ff92cbf84e0fa8ad1643c560a07944ea Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 18 Sep 2025 13:35:21 -0700 Subject: [PATCH 4/6] [V0 Deprecation] Remove unused async_timeout.py (#25190) Signed-off-by: Woosuk Kwon --- vllm/engine/async_timeout.py | 173 ----------------------------------- 1 file changed, 173 deletions(-) delete mode 100644 vllm/engine/async_timeout.py diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py deleted file mode 100644 index 3b9c055160c1b..0000000000000 --- a/vllm/engine/async_timeout.py +++ /dev/null @@ -1,173 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Workaround for https://github.com/python/cpython/issues/86296 -# -# From https://github.com/aio-libs/async-timeout/blob/master/async_timeout/__init__.py -# Licensed under the Apache License (Apache-2.0) - -import asyncio -import enum -import sys -from types import TracebackType -from typing import Any, Optional, Type - -if sys.version_info[:2] >= (3, 11): - from asyncio import timeout as asyncio_timeout -else: - - class _State(enum.Enum): - INIT = "INIT" - ENTER = "ENTER" - TIMEOUT = "TIMEOUT" - EXIT = "EXIT" - - class Timeout: - # Internal class, please don't instantiate it directly - # Use timeout() and timeout_at() public factories instead. - # - # Implementation note: `async with timeout()` is preferred - # over `with timeout()`. - # While technically the Timeout class implementation - # doesn't need to be async at all, - # the `async with` statement explicitly points that - # the context manager should be used from async function context. - # - # This design allows to avoid many silly misusages. - # - # TimeoutError is raised immediately when scheduled - # if the deadline is passed. 
- # The purpose is to time out as soon as possible - # without waiting for the next await expression. - - __slots__ = ("_deadline", "_loop", "_state", "_timeout_handler") - - def __init__(self, deadline: Optional[float], - loop: asyncio.AbstractEventLoop) -> None: - self._loop = loop - self._state = _State.INIT - - self._timeout_handler = None # type: Optional[asyncio.Handle] - if deadline is None: - self._deadline = None # type: Optional[float] - else: - self.update(deadline) - - async def __aenter__(self) -> "Timeout": - self._do_enter() - return self - - async def __aexit__( - self, - exc_type: Optional[Type[BaseException]], - exc_val: Optional[BaseException], - exc_tb: Optional[TracebackType], - ) -> Optional[bool]: - self._do_exit(exc_type) - return None - - @property - def expired(self) -> bool: - """Is timeout expired during execution?""" - return self._state == _State.TIMEOUT - - @property - def deadline(self) -> Optional[float]: - return self._deadline - - def reject(self) -> None: - """Reject scheduled timeout if any.""" - # cancel is maybe better name but - # task.cancel() raises CancelledError in asyncio world. - if self._state not in (_State.INIT, _State.ENTER): - raise RuntimeError(f"invalid state {self._state.value}") - self._reject() - - def _reject(self) -> None: - if self._timeout_handler is not None: - self._timeout_handler.cancel() - self._timeout_handler = None - - def shift(self, delay: float) -> None: - """Advance timeout on delay seconds. - The delay can be negative. - Raise RuntimeError if shift is called when deadline is not scheduled - """ - deadline = self._deadline - if deadline is None: - raise RuntimeError( - "cannot shift timeout if deadline is not scheduled") - self.update(deadline + delay) - - def update(self, deadline: float) -> None: - """Set deadline to absolute value. - deadline argument points on the time in the same clock system - as loop.time(). - If new deadline is in the past the timeout is raised immediately. - Please note: it is not POSIX time but a time with - undefined starting base, e.g. the time of the system power on. 
- """ - if self._state == _State.EXIT: - raise RuntimeError( - "cannot reschedule after exit from context manager") - if self._state == _State.TIMEOUT: - raise RuntimeError("cannot reschedule expired timeout") - if self._timeout_handler is not None: - self._timeout_handler.cancel() - self._deadline = deadline - if self._state != _State.INIT: - self._reschedule() - - def _reschedule(self) -> None: - assert self._state == _State.ENTER - deadline = self._deadline - if deadline is None: - return - - now = self._loop.time() - if self._timeout_handler is not None: - self._timeout_handler.cancel() - - task = asyncio.current_task() - if deadline <= now: - self._timeout_handler = self._loop.call_soon( - self._on_timeout, task) - else: - self._timeout_handler = self._loop.call_at( - deadline, self._on_timeout, task) - - def _do_enter(self) -> None: - if self._state != _State.INIT: - raise RuntimeError(f"invalid state {self._state.value}") - self._state = _State.ENTER - self._reschedule() - - def _do_exit(self, exc_type: Optional[Type[BaseException]]) -> None: - if exc_type is asyncio.CancelledError and \ - self._state == _State.TIMEOUT: - self._timeout_handler = None - raise asyncio.TimeoutError - # timeout has not expired - self._state = _State.EXIT - self._reject() - return None - - def _on_timeout(self, task: "Optional[asyncio.Task[Any]]") -> None: - if task: - task.cancel() - self._state = _State.TIMEOUT - # drop the reference early - self._timeout_handler = None - - def asyncio_timeout(delay: Optional[float]) -> Timeout: - """timeout context manager. - Useful in cases when you want to apply timeout logic around block - of code or in cases when asyncio.wait_for is not suitable. For example: - >>> async with timeout(0.001): - ... async with aiohttp.get('https://github.com') as r: - ... 
await r.text() - delay - value in seconds or None to disable timeout logic - """ - loop = asyncio.get_running_loop() - deadline = loop.time() + delay if delay is not None else None - return Timeout(deadline, loop) From a53ad626d629e79264f0a6ab6820a4b547f3b1c4 Mon Sep 17 00:00:00 2001 From: Or Ozeri Date: Thu, 18 Sep 2025 23:53:52 +0300 Subject: [PATCH 5/6] [KV offload][1b/N] rename offloading to kv_offload (#25191) Signed-off-by: Or Ozeri --- .buildkite/test-pipeline.yaml | 2 +- tests/v1/{offloading => kv_offload}/test_worker.py | 4 ++-- vllm/v1/{offloading => kv_offload}/abstract.py | 0 vllm/v1/{offloading => kv_offload}/mediums.py | 2 +- vllm/v1/{offloading => kv_offload}/worker/worker.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) rename tests/v1/{offloading => kv_offload}/test_worker.py (97%) rename vllm/v1/{offloading => kv_offload}/abstract.py (100%) rename vllm/v1/{offloading => kv_offload}/mediums.py (93%) rename vllm/v1/{offloading => kv_offload}/worker/worker.py (98%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 5fd08296625ad..c42ec4f2503d0 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -280,7 +280,7 @@ steps: # split the test to avoid interference - pytest -v -s v1/core - pytest -v -s v1/executor - - pytest -v -s v1/offloading + - pytest -v -s v1/kv_offload - pytest -v -s v1/sample - pytest -v -s v1/logits_processors - pytest -v -s v1/worker diff --git a/tests/v1/offloading/test_worker.py b/tests/v1/kv_offload/test_worker.py similarity index 97% rename from tests/v1/offloading/test_worker.py rename to tests/v1/kv_offload/test_worker.py index 2391b565773aa..6cf8aa0875d62 100644 --- a/tests/v1/offloading/test_worker.py +++ b/tests/v1/kv_offload/test_worker.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from vllm.v1.offloading.abstract import LoadStoreSpec -from vllm.v1.offloading.worker.worker import (OffloadingHandler, +from vllm.v1.kv_offload.abstract import LoadStoreSpec +from vllm.v1.kv_offload.worker.worker import (OffloadingHandler, OffloadingWorker, TransferResult, TransferSpec) diff --git a/vllm/v1/offloading/abstract.py b/vllm/v1/kv_offload/abstract.py similarity index 100% rename from vllm/v1/offloading/abstract.py rename to vllm/v1/kv_offload/abstract.py diff --git a/vllm/v1/offloading/mediums.py b/vllm/v1/kv_offload/mediums.py similarity index 93% rename from vllm/v1/offloading/mediums.py rename to vllm/v1/kv_offload/mediums.py index 5a1887848c9fc..8962819178459 100644 --- a/vllm/v1/offloading/mediums.py +++ b/vllm/v1/kv_offload/mediums.py @@ -4,7 +4,7 @@ from abc import ABC import numpy as np -from vllm.v1.offloading.abstract import LoadStoreSpec +from vllm.v1.kv_offload.abstract import LoadStoreSpec class BlockIDsLoadStoreSpec(LoadStoreSpec, ABC): diff --git a/vllm/v1/offloading/worker/worker.py b/vllm/v1/kv_offload/worker/worker.py similarity index 98% rename from vllm/v1/offloading/worker/worker.py rename to vllm/v1/kv_offload/worker/worker.py index d2c2045d1f1f6..b7a52a088fb90 100644 --- a/vllm/v1/offloading/worker/worker.py +++ b/vllm/v1/kv_offload/worker/worker.py @@ -3,7 +3,7 @@ from abc import ABC, abstractmethod from vllm.logger import init_logger -from vllm.v1.offloading.abstract import LoadStoreSpec +from vllm.v1.kv_offload.abstract import LoadStoreSpec # a single transfer spec (src_blocks_spec, dst_blocks_spec) TransferSpec = tuple[LoadStoreSpec, LoadStoreSpec] From 
9fac6aa30b669de75d8718164cd99676d3530e7d Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 18 Sep 2025 17:26:28 -0400 Subject: [PATCH 6/6] [BugFix] Fix DeepGEMM warmup, no m.weight_scale_inv (#25206) Signed-off-by: Lucas Wilkinson --- vllm/model_executor/warmup/deep_gemm_warmup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py index a636a714145cf..4d1829cd228cd 100644 --- a/vllm/model_executor/warmup/deep_gemm_warmup.py +++ b/vllm/model_executor/warmup/deep_gemm_warmup.py @@ -36,7 +36,7 @@ def _extract_data_from_linear_base_module( assert m.quant_method.quant_config is not None w = m.weight - ws = m.weight_scale_inv + ws = m.weight_scale quant_block_size = m.quant_method.quant_config.weight_block_size assert isinstance(w, torch.Tensor)
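
Usage note (illustrative, not part of any patch above): a minimal sketch of how a model could opt into the SupportsMRoPE interface added in PATCH 1/6, and how runner code can detect it via supports_mrope() before falling back to MRotaryEmbedding. The class name MyMRoPEModel and its text-only position logic are hypothetical placeholders for this example; a real model computes T/H/W positions from its multimodal grids, as the Qwen2-VL implementation in PATCH 1/6 does.

    from typing import Optional, Union

    import torch
    from torch import nn
    from transformers import PretrainedConfig

    from vllm.model_executor.models.interfaces import (SupportsMRoPE,
                                                       supports_mrope)


    class MyMRoPEModel(nn.Module, SupportsMRoPE):
        """Hypothetical model that opts into model-specific M-RoPE."""

        def get_mrope_input_positions(
            self,
            input_tokens: list[int],
            hf_config: PretrainedConfig,
            image_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
            video_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
            second_per_grid_ts: Optional[list[float]] = None,
            context_len: int = 0,
            seq_len: Optional[int] = None,
            audio_feature_lengths: Optional[torch.Tensor] = None,
            use_audio_in_video: bool = False,
        ) -> tuple[torch.Tensor, int]:
            # Placeholder text-only logic: identical T/H/W positions and a
            # zero position delta, sliced to the [context_len, seq_len) window.
            positions = torch.arange(len(input_tokens)).view(1, -1).expand(3, -1)
            return positions[:, context_len:seq_len], 0


    model = MyMRoPEModel()
    # Runner-side dispatch, mirroring the gpu_model_runner.py hunk in
    # PATCH 1/6: models passing supports_mrope() supply their own positions;
    # everything else uses MRotaryEmbedding.get_input_positions_tensor().
    assert supports_mrope(model)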