From 55602bb2e695108e35501dfcd5890e7664c31495 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Mon, 1 Sep 2025 16:50:25 +0800 Subject: [PATCH 01/95] [Frontend] Update the warning log when using VLLM_ALLOW_LONG_MAX_MODEL_LEN (#20904) Signed-off-by: wang.yuqi Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/__init__.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 484f3986bb301..f53e8b0308853 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3021,16 +3021,20 @@ def _get_and_verify_max_len( f"User-specified max_model_len ({max_model_len}) is greater " f"than the derived max_model_len ({max_len_key}=" f"{derived_max_model_len} or model_max_length=" - f"{model_max_length} in model's config.json). This may lead " - "to incorrect model outputs or CUDA errors.") + f"{model_max_length} in model's config.json).") + warning = ( + "VLLM_ALLOW_LONG_MAX_MODEL_LEN must be used with extreme " + "caution. If the model uses relative position encoding (RoPE), " + "positions exceeding derived_max_model_len lead to nan. If the " + "model uses absolute position encoding, positions exceeding " + "derived_max_model_len will cause a CUDA array out-of-bounds " + "error.") if envs.VLLM_ALLOW_LONG_MAX_MODEL_LEN: - logger.warning( - "%s Make sure the value is correct and within the " - "model context size.", msg) + logger.warning_once("%s %s", msg, warning) else: raise ValueError( f"{msg} To allow overriding this maximum, set " - "the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1") + f"the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1. 
{warning}") return int(max_model_len) From dc1a53186d8dbb3b450174a38a10c474b55f22e9 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Mon, 1 Sep 2025 17:38:04 +0800 Subject: [PATCH 02/95] [Kernel] Update DeepGEMM to latest commit (#23915) Signed-off-by: Jee Jee Li Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- docker/Dockerfile | 5 ++--- tools/install_deepgemm.sh | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 2e272cbca8417..75e8fa49f86c9 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -432,11 +432,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') # Install DeepGEMM from source -ARG DEEPGEMM_GIT_REF="7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c" +ARG DEEPGEMM_GIT_REF COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh RUN --mount=type=cache,target=/root/.cache/uv \ - VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "${DEEPGEMM_GIT_REF}" \ - && rm /tmp/install_deepgemm.sh + VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} # Install EP kernels(pplx-kernels and DeepEP), NixL COPY tools/ep_kernels/install_python_libraries.sh install_python_libraries.sh diff --git a/tools/install_deepgemm.sh b/tools/install_deepgemm.sh index 33849581d2c0e..b125cda96f179 100755 --- a/tools/install_deepgemm.sh +++ b/tools/install_deepgemm.sh @@ -6,7 +6,7 @@ set -e # Default values DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git" -DEEPGEMM_GIT_REF="7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c" +DEEPGEMM_GIT_REF="ea9c5d9270226c5dd7a577c212e9ea385f6ef048" # Parse command line arguments while [[ $# -gt 0 ]]; do From 107284959ac14d42aba09a03220ff21da1eafcb2 Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> 
Date: Mon, 1 Sep 2025 11:38:20 +0200 Subject: [PATCH 03/95] [Doc]: fix typos in Python comments (#24026) Signed-off-by: Didier Durand --- examples/offline_inference/multilora_inference.py | 2 +- vllm/distributed/device_communicators/pynccl.py | 2 +- vllm/distributed/parallel_state.py | 4 ++-- .../entrypoints/openai/tool_parsers/pythonic_tool_parser.py | 2 +- vllm/model_executor/layers/fused_moe/moe_pallas.py | 2 +- vllm/model_executor/models/ovis.py | 2 +- vllm/model_executor/models/phi4mm_audio.py | 6 +++--- vllm/model_executor/models/phi4mm_utils.py | 2 +- vllm/third_party/pynvml.py | 2 +- vllm/transformers_utils/configs/nemotron.py | 2 +- vllm/transformers_utils/configs/nemotron_h.py | 2 +- vllm/transformers_utils/processors/ovis.py | 2 +- vllm/transformers_utils/processors/ovis2_5.py | 2 +- vllm/v1/spec_decode/ngram_proposer.py | 2 +- 14 files changed, 17 insertions(+), 17 deletions(-) diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py index f0c00bcaaeb11..6040683c68bcd 100644 --- a/examples/offline_inference/multilora_inference.py +++ b/examples/offline_inference/multilora_inference.py @@ -23,7 +23,7 @@ def create_test_prompts( 2 requests for base model, 4 requests for the LoRA. We define 2 different LoRA adapters (using the same model for demo purposes). Since we also set `max_loras=1`, the expectation is that the requests - with the second LoRA adapter will be ran after all requests with the + with the second LoRA adapter will be run after all requests with the first adapter have finished. """ return [ diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 502bfd39005ad..3e4d0d250af94 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -31,7 +31,7 @@ class PyNcclCommunicator: group: the process group to work on. If None, it will use the default process group. 
device: the device to bind the PyNcclCommunicator to. If None, - it will be bind to f"cuda:{local_rank}". + it will be bound to f"cuda:{local_rank}". library_path: the path to the NCCL library. If None, it will use the default library path. It is the caller's responsibility to make sure each communicator diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index b89aee99c8d46..fc96c2ac926b0 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -939,8 +939,8 @@ def get_pipeline_model_parallel_group(): def graph_capture(device: torch.device): """ `graph_capture` is a context manager which should surround the code that - is capturing the CUDA graph. Its main purpose is to ensure that the - some operations will be run after the graph is captured, before the graph + is capturing the CUDA graph. Its main purpose is to ensure that some + operations will be run after the graph is captured, before the graph is replayed. It returns a `GraphCaptureContext` object which contains the necessary data for the graph capture. Currently, it only contains the stream that the graph capture is running on. This stream is set to the diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py index 73329cdf701d6..992f141bef0f2 100644 --- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py @@ -165,7 +165,7 @@ class PythonicToolParser(ToolParser): index] += delta.function.arguments # HACK: serving_chat.py inspects the internal state of tool parsers - # when determining it's final streaming delta, automatically + # when determining its final streaming delta, automatically # adding autocompleted JSON. # These two lines avoid that nonsense while ensuring finish_reason # is set to tool_calls when at least one tool is called. 
diff --git a/vllm/model_executor/layers/fused_moe/moe_pallas.py b/vllm/model_executor/layers/fused_moe/moe_pallas.py index 582ae3e12c289..23f618b1a5fd7 100644 --- a/vllm/model_executor/layers/fused_moe/moe_pallas.py +++ b/vllm/model_executor/layers/fused_moe/moe_pallas.py @@ -7,7 +7,7 @@ import torch.nn.functional as F def _histogram(input: torch.Tensor, min: int, max: int) -> torch.Tensor: """ - Compute the histogram of a int32 tensor. The bin edges are defined by the + Compute the histogram of an int32 tensor. The bin edges are defined by the min and max values, with step = 1. """ assert input.dtype == torch.int32, "input must be of torch.int32 dtype." diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index 04a06e5f9d600..41fd272397e64 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -544,7 +544,7 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP): vision_embeddings) input_ids = None - # up until here we have a inputs_embeds 100% numerical identity + # up until here we have an inputs_embeds 100% numerical identity # between the OG HF Transformers implementation and ours hidden_states = self.llm( input_ids=input_ids, diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py index 0b0d66ae771dd..b5e4d727bf210 100644 --- a/vllm/model_executor/models/phi4mm_audio.py +++ b/vllm/model_executor/models/phi4mm_audio.py @@ -43,7 +43,7 @@ class ConformerEncoderLayer(nn.Module): if set different to 0, the number of depthwise_seperable_out_channel will be used as a channel_out of the second conv1d layer. - otherwise, it equal to 0, the second conv1d layer is skipped. + otherwise, it equals to 0, the second conv1d layer is skipped. depthwise_multiplier: int number of input_dim channels duplication. this value will be used to compute the hidden channels of the Conv1D. 
@@ -115,7 +115,7 @@ class ConformerEncoderLayer(nn.Module): we recalculate activation in backward. default "". export: bool, optional - if set to True, it remove the padding from convolutional layers + if set to True, it removes the padding from convolutional layers and allow the onnx conversion for inference. default False. use_pt_scaled_dot_product_attention: bool, optional @@ -686,7 +686,7 @@ class ConformerEncoder(TransformerEncoderBase): only work for glu_in_attention !=0 default "swish". export: bool, optional - if set to True, it remove the padding from convolutional layers + if set to True, it removes the padding from convolutional layers and allow the onnx conversion for inference. default False. activation_checkpointing: str, optional diff --git a/vllm/model_executor/models/phi4mm_utils.py b/vllm/model_executor/models/phi4mm_utils.py index c4890d8427e2a..59535503822d4 100644 --- a/vllm/model_executor/models/phi4mm_utils.py +++ b/vllm/model_executor/models/phi4mm_utils.py @@ -258,7 +258,7 @@ class DepthWiseSeperableConv1d(nn.Module): if set different to 0, the number of depthwise_seperable_out_channel will be used as a channel_out of the second conv1d layer. - otherwise, it equal to 0, the second conv1d layer is skipped. + otherwise, it equals to 0, the second conv1d layer is skipped. kernel_size: int kernel_size depthwise_multiplier: int diff --git a/vllm/third_party/pynvml.py b/vllm/third_party/pynvml.py index d215e5d8bf657..c06aa567444d8 100644 --- a/vllm/third_party/pynvml.py +++ b/vllm/third_party/pynvml.py @@ -1022,7 +1022,7 @@ def _extractNVMLErrorsAsClasses(): Each NVML Error gets a new NVMLError subclass. This way try,except blocks can filter appropriate exceptions more easily. - NVMLError is a parent class. Each NVML_ERROR_* gets it's own subclass. + NVMLError is a parent class. Each NVML_ERROR_* gets its own subclass. e.g. 
NVML_ERROR_ALREADY_INITIALIZED will be turned into NVMLError_AlreadyInitialized ''' this_module = sys.modules[__name__] diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index 9a7243b1262c0..090fefa14203e 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -26,7 +26,7 @@ logger = logging.get_logger(__name__) class NemotronConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a - [`NemotronModel`]. It is used to instantiate an Nemotron model + [`NemotronModel`]. It is used to instantiate a Nemotron model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Nemotron-8B. diff --git a/vllm/transformers_utils/configs/nemotron_h.py b/vllm/transformers_utils/configs/nemotron_h.py index 027f2911543f5..581bed5716c1c 100644 --- a/vllm/transformers_utils/configs/nemotron_h.py +++ b/vllm/transformers_utils/configs/nemotron_h.py @@ -38,7 +38,7 @@ class NemotronHConfig(PretrainedConfig): passed when calling [`NemotronHModel`] tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be - tied. Note that this is only relevant if the model has a output + tied. Note that this is only relevant if the model has an output word embedding layer. hidden_size (`int`, *optional*, defaults to 4096): Dimension of the hidden representations. 
diff --git a/vllm/transformers_utils/processors/ovis.py b/vllm/transformers_utils/processors/ovis.py index 557d251c45f3b..0077a7a8ce656 100644 --- a/vllm/transformers_utils/processors/ovis.py +++ b/vllm/transformers_utils/processors/ovis.py @@ -55,7 +55,7 @@ class OvisProcessorKwargs(ProcessingKwargs, total=False): # type: ignore[call- class OvisProcessor(ProcessorMixin): r""" - Constructs a Ovis processor which wraps a Ovis image processor and a Qwen2 tokenizer into a single processor. + Constructs an Ovis processor which wraps an Ovis image processor and a Qwen2 tokenizer into a single processor. [`OvisProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the [`~OvisProcessor.__call__`] and [`~OvisProcessor.decode`] for more information. Args: diff --git a/vllm/transformers_utils/processors/ovis2_5.py b/vllm/transformers_utils/processors/ovis2_5.py index d3273257ff8c2..282e9cb2116e0 100644 --- a/vllm/transformers_utils/processors/ovis2_5.py +++ b/vllm/transformers_utils/processors/ovis2_5.py @@ -41,7 +41,7 @@ class Ovis2_5ProcessorKwargs(ProcessingKwargs, class Ovis2_5Processor(ProcessorMixin): r""" - Constructs a Ovis processor which wraps a Ovis image processor + Constructs an Ovis processor which wraps an Ovis image processor and a Qwen2 tokenizer into a single processor. [`OvisProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. 
diff --git a/vllm/v1/spec_decode/ngram_proposer.py b/vllm/v1/spec_decode/ngram_proposer.py index fbcf2cb50d371..b92e396d4536e 100644 --- a/vllm/v1/spec_decode/ngram_proposer.py +++ b/vllm/v1/spec_decode/ngram_proposer.py @@ -107,7 +107,7 @@ def _find_longest_matched_ngram_and_propose_tokens( longest_ngram = 0 position = 0 - # lps[0] always equal to 0, we starts with index 1 + # lps[0] always equal to 0, we start with index 1 prev_lps = 0 i = 1 while i < total_token: From d46934b2297eb3dbda24c3bf26f6655d88ba99bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Mon, 1 Sep 2025 12:07:46 +0200 Subject: [PATCH 04/95] [Frontend] Gemma3n audio `transcriptions`/`translations` endpoint (#23735) Signed-off-by: NickLucche Co-authored-by: Cyrus Leung --- tests/entrypoints/openai/conftest.py | 27 +++++++ .../openai/test_transcription_validation.py | 35 +++++---- .../openai/test_translation_validation.py | 78 +++++++++++-------- vllm/entrypoints/openai/protocol.py | 19 +++++ vllm/entrypoints/openai/speech_to_text.py | 7 +- vllm/model_executor/models/gemma3n_mm.py | 65 +++++++++++++++- vllm/model_executor/models/interfaces.py | 6 +- vllm/model_executor/models/voxtral.py | 8 +- vllm/model_executor/models/whisper.py | 7 +- 9 files changed, 189 insertions(+), 63 deletions(-) create mode 100644 tests/entrypoints/openai/conftest.py diff --git a/tests/entrypoints/openai/conftest.py b/tests/entrypoints/openai/conftest.py new file mode 100644 index 0000000000000..0ecdd4245df43 --- /dev/null +++ b/tests/entrypoints/openai/conftest.py @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest + +from vllm.assets.audio import AudioAsset + + +@pytest.fixture +def mary_had_lamb(): + path = AudioAsset('mary_had_lamb').get_local_path() + with open(str(path), "rb") as f: + yield f + + +@pytest.fixture +def winning_call(): + path = AudioAsset('winning_call').get_local_path() + with 
open(str(path), "rb") as f: + yield f + + +@pytest.fixture +def foscolo(): + # Test translation it->en + path = AudioAsset('azacinto_foscolo').get_local_path() + with open(str(path), "rb") as f: + yield f diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index 6009d9aeec935..6a3cdfdfc8081 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -12,8 +12,6 @@ import pytest import pytest_asyncio import soundfile as sf -from vllm.assets.audio import AudioAsset - from ...utils import RemoteOpenAIServer MODEL_NAME = "openai/whisper-large-v3-turbo" @@ -24,20 +22,6 @@ MISTRAL_FORMAT_ARGS = [ ] -@pytest.fixture -def mary_had_lamb(): - path = AudioAsset('mary_had_lamb').get_local_path() - with open(str(path), "rb") as f: - yield f - - -@pytest.fixture -def winning_call(): - path = AudioAsset('winning_call').get_local_path() - with open(str(path), "rb") as f: - yield f - - @pytest.fixture(scope="module") def server(): with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server: @@ -76,6 +60,25 @@ async def test_basic_audio(mary_had_lamb, model_name): assert out_usage["seconds"] == 16, out_usage["seconds"] +@pytest.mark.asyncio +async def test_basic_audio_gemma(foscolo): + # Gemma accuracy on some of the audio samples we use is particularly bad, + # hence we use a different one here. WER is evaluated separately. 
+ model_name = "google/gemma-3n-E2B-it" + server_args = ["--enforce-eager"] + + with RemoteOpenAIServer(model_name, server_args) as remote_server: + client = remote_server.get_async_client() + transcription = await client.audio.transcriptions.create( + model=model_name, + file=foscolo, + language="it", + response_format="text", + temperature=0.0) + out = json.loads(transcription)['text'] + assert "da cui vergine nacque Venere" in out + + @pytest.mark.asyncio async def test_non_asr_model(winning_call): # text to text model diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py index f4f5c66f2deeb..f43b7a253d28d 100644 --- a/tests/entrypoints/openai/test_translation_validation.py +++ b/tests/entrypoints/openai/test_translation_validation.py @@ -12,32 +12,24 @@ import pytest import pytest_asyncio import soundfile as sf -from vllm.assets.audio import AudioAsset - from ...utils import RemoteOpenAIServer -MODEL_NAME = "openai/whisper-small" SERVER_ARGS = ["--enforce-eager"] -@pytest.fixture -def foscolo(): - # Test translation it->en - path = AudioAsset('azacinto_foscolo').get_local_path() - with open(str(path), "rb") as f: - yield f - - -@pytest.fixture(scope="module") -def server(): - with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server: - yield remote_server +@pytest.fixture(scope="module", + params=["openai/whisper-small", "google/gemma-3n-E2B-it"]) +def server(request): + # Parametrize over model name + with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server: + yield remote_server, request.param @pytest_asyncio.fixture -async def client(server): +async def client_and_model(server): + server, model_name = server async with server.get_async_client() as async_client: - yield async_client + yield async_client, model_name @pytest.mark.asyncio @@ -56,27 +48,29 @@ async def test_non_asr_model(foscolo): # NOTE: (NickLucche) the large-v3-turbo model was not trained on 
translation! @pytest.mark.asyncio -async def test_basic_audio(foscolo, client): +async def test_basic_audio(foscolo, client_and_model): + client, model_name = client_and_model translation = await client.audio.translations.create( - model=MODEL_NAME, + model=model_name, file=foscolo, response_format="text", - # TODO remove once language detection is implemented - extra_body=dict(language="it"), + # TODO remove `language="it"` once language detection is implemented + extra_body=dict(language="it", to_language="en"), temperature=0.0) out = json.loads(translation)['text'].strip().lower() assert "greek sea" in out @pytest.mark.asyncio -async def test_audio_prompt(foscolo, client): +async def test_audio_prompt(foscolo, client_and_model): + client, model_name = client_and_model # Condition whisper on starting text prompt = "Nor have I ever" transcription = await client.audio.translations.create( - model=MODEL_NAME, + model=model_name, file=foscolo, prompt=prompt, - extra_body=dict(language="it"), + extra_body=dict(language="it", to_language="en"), response_format="text", temperature=0.0) out = json.loads(transcription)['text'] @@ -85,22 +79,27 @@ async def test_audio_prompt(foscolo, client): @pytest.mark.asyncio -async def test_streaming_response(foscolo, client, server): +async def test_streaming_response(foscolo, client_and_model, server): + client, model_name = client_and_model translation = "" res_no_stream = await client.audio.translations.create( - model=MODEL_NAME, + model=model_name, file=foscolo, response_format="json", - extra_body=dict(language="it"), + extra_body=dict(language="it", to_language="en", seed=42), temperature=0.0) + # Stream via HTTPX since OpenAI translation client doesn't expose streaming + server, model_name = server url = server.url_for("v1/audio/translations") headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"} data = { - "model": MODEL_NAME, + "model": model_name, "language": "it", + "to_language": "en", "stream": True, 
"temperature": 0.0, + "seed": 42, } foscolo.seek(0) async with httpx.AsyncClient() as http_client: @@ -121,16 +120,24 @@ async def test_streaming_response(foscolo, client, server): text = chunk["choices"][0].get("delta", {}).get("content") translation += text or "" - assert translation == res_no_stream.text + res_stream = translation.split() + # NOTE There's a small non-deterministic issue here, likely in the attn + # computation, which will cause a few tokens to be different, while still + # being very close semantically. + assert sum([ + x == y for x, y in zip(res_stream, res_no_stream.text.split()) + ]) >= len(res_stream) * 0.9 @pytest.mark.asyncio -async def test_stream_options(foscolo, client, server): +async def test_stream_options(foscolo, server): + server, model_name = server url = server.url_for("v1/audio/translations") headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"} data = { - "model": MODEL_NAME, + "model": model_name, "language": "it", + "to_language": "en", "stream": True, "stream_include_usage": True, "stream_continuous_usage_stats": True, @@ -164,7 +171,10 @@ async def test_stream_options(foscolo, client, server): @pytest.mark.asyncio -async def test_long_audio_request(foscolo, client): +async def test_long_audio_request(foscolo, client_and_model): + client, model_name = client_and_model + if model_name == "google/gemma-3n-E2B-it": + pytest.skip("Gemma3n does not support long audio requests") foscolo.seek(0) audio, sr = librosa.load(foscolo) repeated_audio = np.tile(audio, 2) @@ -173,9 +183,9 @@ async def test_long_audio_request(foscolo, client): sf.write(buffer, repeated_audio, sr, format='WAV') buffer.seek(0) translation = await client.audio.translations.create( - model=MODEL_NAME, + model=model_name, file=buffer, - extra_body=dict(language="it"), + extra_body=dict(language="it", to_language="en"), response_format="text", temperature=0.0) out = json.loads(translation)['text'].strip().lower() diff --git 
a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 12b274e1211bc..00b72f74cec85 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -2175,6 +2175,13 @@ class TranscriptionRequest(OpenAIBaseModel): ) # --8<-- [end:transcription-extra-params] + to_language: Optional[str] = None + """The language of the output audio we transcribe to. + + Please note that this is not currently used by supported models at this + time, but it is a placeholder for future use, matching translation api. + """ + # --8<-- [start:transcription-sampling-params] temperature: float = Field(default=0.0) """The sampling temperature, between 0 and 1. @@ -2408,6 +2415,9 @@ class TranslationRequest(OpenAIBaseModel): # TODO support additional sampling parameters # --8<-- [start:translation-sampling-params] + seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) + """The seed to use for sampling.""" + temperature: float = Field(default=0.0) """The sampling temperature, between 0 and 1. @@ -2427,6 +2437,14 @@ class TranslationRequest(OpenAIBaseModel): will improve accuracy. """ + to_language: Optional[str] = None + """The language of the input audio we translate to. + + Please note that this is not supported by all models, refer to the specific + model documentation for more details. + For instance, Whisper only supports `to_language=en`. + """ + stream: Optional[bool] = False """Custom field not present in the original OpenAI definition. 
When set, it will enable output to be streamed in a similar fashion as the Chat @@ -2458,6 +2476,7 @@ class TranslationRequest(OpenAIBaseModel): return SamplingParams.from_optional(temperature=temperature, max_tokens=max_tokens, + seed=self.seed, output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY) diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index 1cbd7dba393f6..965bdac3ac5ad 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -89,6 +89,9 @@ class OpenAISpeechToText(OpenAIServing): ) -> tuple[list[PromptType], float]: # Validate request language = self.model_cls.validate_language(request.language) + # Skip to_language validation to avoid extra logging for Whisper. + to_language = self.model_cls.validate_language(request.to_language) \ + if request.to_language else None if len(audio_data) / 1024**2 > self.max_audio_filesize_mb: raise ValueError("Maximum file size exceeded.") @@ -112,7 +115,9 @@ class OpenAISpeechToText(OpenAIServing): model_config=self.model_config, language=language, task_type=self.task_type, - request_prompt=request.prompt) + request_prompt=request.prompt, + to_language=to_language, + ) prompts.append(prompt) return prompts, duration diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index d59dde1560aea..c25bbcd420c39 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable, Mapping, Sequence -from typing import Any, Optional, TypedDict, Union, cast +from typing import Any, Literal, Optional, TypedDict, Union, cast +import numpy as np import torch from torch import nn from transformers import AutoModel, BatchFeature @@ -13,7 +14,8 @@ from 
transformers.models.gemma3n import (Gemma3nAudioConfig, Gemma3nVisionConfig) from transformers.models.siglip import SiglipImageProcessorFast -from vllm.config import VllmConfig +from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig +from vllm.inputs.data import PromptType from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import RowParallelLinear @@ -21,6 +23,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.models.gemma3n import Gemma3nForCausalLM from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.model_executor.models.whisper import ISO639_1_SUPPORTED_LANGS from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -40,7 +43,8 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors -from .interfaces import MultiModalEmbeddings, SupportsMultiModal +from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, + SupportsTranscription) from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -410,7 +414,10 @@ class Gemma3nMultimodalEmbedder(nn.Module): @MULTIMODAL_REGISTRY.register_processor(Gemma3nMultiModalProcessor, info=Gemma3nProcessingInfo, dummy_inputs=Gemma3nDummyInputsBuilder) -class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal): +class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal, + SupportsTranscription): + supported_languages = ISO639_1_SUPPORTED_LANGS + packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -694,3 +701,53 @@ class Gemma3nForConditionalGeneration(nn.Module, 
SupportsMultiModal): return "" else: raise ValueError(f"Unsupported modality: {modality}") + + @classmethod + def get_generation_prompt(cls, audio: np.ndarray, + stt_config: SpeechToTextConfig, + model_config: ModelConfig, + language: Optional[str], + task_type: Literal["transcribe", "translate"], + request_prompt: str, + to_language: Optional[str]) -> PromptType: + """ + Gemma3n supports "free-form" transcription. + We fix its prompt here to standardize transcriptions/translations + requests. + """ + # Transcribe this audio [into <>] | for transcription + # Translate this audio [from <> into <>] | for translation + prompt = "user\n" + prompt += "Transcribe" if task_type == "transcribe" else "Translate" + prompt += " this audio" + + # We assume the language is a valid ISO 639-1 code. + full_lang_name = cls.supported_languages.get(language, "") + # Translation only for now + full_lang_name_to = cls.supported_languages.get(to_language, "") + + if task_type == "transcribe" and full_lang_name: + prompt += f" into {full_lang_name}" + elif task_type == "translate": + if full_lang_name: + prompt += f" from {full_lang_name}" + if full_lang_name_to: + prompt += f" into {full_lang_name_to}" + + prompt += ": \nmodel\n" + + audio = (audio, stt_config.sample_rate) + prompts_dict = {"multi_modal_data": {"audio": audio}, "prompt": prompt} + return cast(PromptType, prompts_dict) + + @classmethod + def get_speech_to_text_config(cls, model_config: ModelConfig, + task_type: str) -> SpeechToTextConfig: + return SpeechToTextConfig( + # Let's set this to 30 as suggested in the docs for now, although + # the model is only limited by its context length. + max_audio_clip_s=30, + sample_rate=16000, + # TODO enable chunking after more thorough testing. 
+ min_energy_split_window_size=None, + ) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 2ee966fb5c0c8..d5b71b057831b 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -700,8 +700,10 @@ class SupportsTranscription(Protocol): def get_generation_prompt(cls, audio: np.ndarray, stt_config: SpeechToTextConfig, model_config: ModelConfig, - language: Optional[str], task_type: str, - request_prompt: str) -> PromptType: + language: Optional[str], + task_type: Literal["transcribe", "translate"], + request_prompt: str, + to_language: Optional[str]) -> PromptType: """Get the prompt for the ASR model. The model has control over the construction, as long as it returns a valid PromptType.""" diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 6bc748407a7d1..f3731b389cfe0 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -5,7 +5,7 @@ import math from collections.abc import Iterable, Mapping, Sequence from functools import cached_property from math import ceil -from typing import Optional, Union, cast +from typing import Literal, Optional, Union, cast import numpy as np import regex as re @@ -455,8 +455,10 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal, def get_generation_prompt(cls, audio: np.ndarray, model_config: ModelConfig, stt_config: SpeechToTextConfig, - language: Optional[str], task_type: str, - request_prompt: str) -> PromptType: + language: Optional[str], + task_type: Literal["transcribe", "translate"], + request_prompt: str, + to_language: Optional[str]) -> PromptType: tokenizer = cached_tokenizer_from_config(model_config) audio = Audio(audio, int(stt_config.sample_rate), format="wav") # lossless diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 16bbe2f2010a1..848b6e0f8093a 100644 --- 
a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -4,7 +4,7 @@ import math from collections.abc import Iterable, Mapping, Sequence from contextlib import nullcontext -from typing import Optional, TypedDict, Union, cast +from typing import Literal, Optional, TypedDict, Union, cast import numpy as np import torch @@ -783,8 +783,9 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription, model_config: ModelConfig, # not needed here stt_config: SpeechToTextConfig, language: Optional[str], - task_type: str, - request_prompt: str) -> PromptType: + task_type: Literal["transcribe", "translate"], + request_prompt: str, + to_language: Optional[str]) -> PromptType: if language is None: raise ValueError( "Language must be specified when creating the Whisper prompt") From 3e330fcb218e207bbc9eec3ed479a8b53b25c98d Mon Sep 17 00:00:00 2001 From: Kay Yan Date: Mon, 1 Sep 2025 18:34:52 +0800 Subject: [PATCH 05/95] [Doc]: Fix CPU install docs: force torch-backend=cpu to avoid GPU torchvision errors (#24033) Signed-off-by: Kay Yan --- docs/getting_started/installation/cpu/build.inc.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/getting_started/installation/cpu/build.inc.md b/docs/getting_started/installation/cpu/build.inc.md index 57a09e674a821..4bd4d39a6f80b 100644 --- a/docs/getting_started/installation/cpu/build.inc.md +++ b/docs/getting_started/installation/cpu/build.inc.md @@ -16,8 +16,8 @@ cd vllm_source Third, install required dependencies: ```bash -uv pip install -r requirements/cpu-build.txt --torch-backend auto -uv pip install -r requirements/cpu.txt --torch-backend auto +uv pip install -r requirements/cpu-build.txt --torch-backend cpu +uv pip install -r requirements/cpu.txt --torch-backend cpu ``` ??? 
console "pip" From 7c8271cd1e85670bd60c058441e1a224421dea01 Mon Sep 17 00:00:00 2001 From: Kwai-Keye Date: Mon, 1 Sep 2025 18:50:27 +0800 Subject: [PATCH 06/95] [Model]: support KeyeVL-1_5-8B (#23838) Signed-off-by: wangruitao Co-authored-by: wangruitao --- docs/models/supported_models.md | 3 +- examples/offline_inference/vision_language.py | 32 + .../vision_language_multi_image.py | 38 ++ .../multimodal/processing/test_common.py | 1 + tests/models/registry.py | 2 + .../layers/rotary_embedding/mrope.py | 129 ++++ vllm/model_executor/models/keye.py | 594 +++++++++-------- vllm/model_executor/models/keye_vl1_5.py | 601 ++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 9 files changed, 1123 insertions(+), 278 deletions(-) create mode 100644 vllm/model_executor/models/keye_vl1_5.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index e8fe77e8d6c98..4b4cebb6a31c2 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -634,7 +634,8 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `InternS1ForConditionalGeneration` | Intern-S1 | T + IE+ + VE+ | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + IE+ + VE+ | `OpenGVLab/InternVL3-1B-hf`, etc. 
| ✅︎ | ✅︎ | ✅︎ | -| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ | +| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-8B-Preview` | ✅︎ | ✅︎ | ✅︎ | +| `KeyeVL1_5ForConditionalGeneration` | Keye-VL-1_5-8B | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-1_5-8B` | ✅︎ | ✅︎ | ✅︎ | | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ | ✅︎ | | `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ | | `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + IE+ | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ | ✅︎ | diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 4e879666f61d7..b104113b88213 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -683,6 +683,37 @@ def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData: ) +# Keye-VL-1.5 +def run_keye_vl1_5(questions: list[str], modality: str) -> ModelRequestData: + model_name = "Kwai-Keye/Keye-VL-1.5-8B" + + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + trust_remote_code=True, + limit_mm_per_prompt={modality: 1}, + ) + + if modality == "image": + placeholder = "<|image_pad|>" + elif modality == "video": + placeholder = "<|video_pad|>" + + prompts = [ + ( + f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n" + ) + for question in questions + ] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # Kimi-VL def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData: assert modality == 
"image" @@ -1648,6 +1679,7 @@ model_example_map = { "interns1": run_interns1, "internvl_chat": run_internvl, "keye_vl": run_keye_vl, + "keye_vl1_5": run_keye_vl1_5, "kimi_vl": run_kimi_vl, "llama4": run_llama4, "llava": run_llava, diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index d9242efa85470..01c2905cf26d8 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -542,6 +542,43 @@ def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_keye_vl1_5(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "Kwai-Keye/Keye-VL-1_5-8B" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=8192, + max_num_seqs=5, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + }, + ] + + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) + + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + image_data = [fetch_image(url) for url in image_urls] + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=image_data, + ) + + def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "moonshotai/Kimi-VL-A3B-Instruct" @@ -1209,6 +1246,7 @@ model_example_map = { "interns1": load_interns1, "internvl_chat": load_internvl, "keye_vl": load_keye_vl, + "keye_vl1_5": load_keye_vl1_5, "kimi_vl": load_kimi_vl, "llama4": load_llama4, "llava": load_llava, diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 3ff4360b83345..16c0428c6d8f1 100644 --- 
a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -293,6 +293,7 @@ def _test_processing_correctness_one( "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview", "OpenGVLab/InternVL3_5-30B-A3B", "Kwai-Keye/Keye-VL-8B-Preview", + "Kwai-Keye/Keye-VL-1_5-8B", "moonshotai/Kimi-VL-A3B-Instruct", "meta-llama/Llama-4-Scout-17B-16E-Instruct", "llava-hf/llava-1.5-7b-hf", diff --git a/tests/models/registry.py b/tests/models/registry.py index a37ffdc311514..3b5cec2dc7022 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -438,6 +438,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { "InternVLForConditionalGeneration": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"), # noqa: E501 "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501 trust_remote_code=True), + "KeyeVL1_5ForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-1_5-8B", # noqa: E501 + trust_remote_code=True), "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501 extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"}, # noqa: E501 trust_remote_code=True), diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index 5686ec7b35de8..0ab4bc5375daf 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -402,6 +402,15 @@ class MRotaryEmbedding(RotaryEmbedding): context_len=context_len, seq_len=seq_len, ) + elif "KeyeVL1_5" in hf_config.model_type: + return cls._keye_get_input_positions_tensor( + input_tokens=input_tokens, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + context_len=context_len, + seq_len=seq_len, + ) else: return cls._vl_get_input_positions_tensor( input_tokens=input_tokens, @@ -636,6 +645,126 @@ class MRotaryEmbedding(RotaryEmbedding): len(input_tokens)).item() return llm_positions, 
mrope_position_delta + @classmethod + def _keye_get_input_positions_tensor( + cls, + input_tokens: list[int], + hf_config: PretrainedConfig, + image_grid_thw: Union[list[list[int]], torch.Tensor], + video_grid_thw: Union[list[list[int]], torch.Tensor], + context_len: int = 0, + seq_len: Optional[int] = None, + ) -> tuple[torch.Tensor, int]: + if isinstance(video_grid_thw, list) and len(video_grid_thw) > 0: + video_grid_thw = video_grid_thw[0] + """Get mrope input positions and delta value (Keye series).""" + + def split_thw( + grid_thw: Union[torch.Tensor, list[int]]) -> list[list[int]]: + """ + Split grid_thw along the t dimension. + + Args: + grid_thw: shape [N, 3] tensor or nested list of [t, h, w]. + + Returns: + List of [1, h, w] rows, repeated t times for each original row. + """ + + if isinstance(grid_thw, list): + grid_thw = torch.tensor(grid_thw, dtype=torch.long) + + if grid_thw.numel() == 0: + return [] + + t, hw = grid_thw[:, 0], grid_thw[:, 1:] + ones = torch.ones_like(hw[:, :1]) # [N,1] + out = torch.cat([ones, hw], dim=1).repeat_interleave(t, dim=0) + return out.tolist() + + video_grid_thw = split_thw(video_grid_thw) + + image_token_id = hf_config.image_token_id + video_token_id = hf_config.video_token_id + spatial_merge_size = hf_config.vision_config.spatial_merge_size + + image_nums = len(image_grid_thw) + frame_nums = len(video_grid_thw) + llm_pos_ids_list: list = [] + + st = 0 + remain_images, remain_frames = image_nums, frame_nums + + image_index, video_index = 0, 0 + for _ in range(image_nums + frame_nums): + if remain_images > 0: + try: + ed_image = input_tokens.index(image_token_id, st) + except ValueError: + ed_image = len(input_tokens) + 1 + else: + ed_image = len(input_tokens) + 1 + if remain_frames > 0: + try: + ed_video = input_tokens.index(video_token_id, st) + except ValueError: + ed_video = len(input_tokens) + 1 + else: + ed_video = len(input_tokens) + 1 + + if ed_image < ed_video: + t, h, w = ( + image_grid_thw[image_index][0], + 
image_grid_thw[image_index][1], + image_grid_thw[image_index][2], + ) + image_index += 1 + remain_images -= 1 + ed = ed_image + else: + t, h, w = ( + video_grid_thw[video_index][0], + video_grid_thw[video_index][1], + video_grid_thw[video_index][2], + ) + video_index += 1 + remain_frames -= 1 + ed = ed_video + + llm_grid_t, llm_grid_h, llm_grid_w = \ + t, h // spatial_merge_size, w // spatial_merge_size + text_len = ed - st + + st_idx = llm_pos_ids_list[-1].max() + 1 if len( + llm_pos_ids_list) > 0 else 0 + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + t_index = (torch.arange(llm_grid_t).view(-1, 1).expand( + -1, llm_grid_h * llm_grid_w)).long().flatten() + + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand( + llm_grid_t, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand( + llm_grid_t, llm_grid_h, -1).flatten() + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + text_len + st_idx) + st = ed + llm_grid_t * llm_grid_h * llm_grid_w + + if st < len(input_tokens): + st_idx = llm_pos_ids_list[-1].max() + 1 if len( + llm_pos_ids_list) > 0 else 0 + text_len = len(input_tokens) - st + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + mrope_position_delta = (llm_positions.max() + 1 - + len(input_tokens)).item() + llm_positions = llm_positions[:, context_len:seq_len] + + return llm_positions, mrope_position_delta + @classmethod def _vl_get_input_positions_tensor( cls, diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index c6dbd62b905e1..710b805acb3ea 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -1,9 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math +from abc import abstractmethod from collections.abc import 
Iterable, Mapping, Sequence from functools import partial -from typing import Annotated, Any, Literal, Optional, Union +from typing import Annotated, Any, Literal, Optional, TypeVar, Union import numpy as np import torch @@ -57,16 +58,13 @@ from .vision import get_vit_attn_backend logger = init_logger(__name__) -_MAX_FRAMES_PER_VIDEO = 16 -_MAX_IMAGE_SIZE = 9999999 - def smart_resize( height: int, width: int, - factor: int = 28, - min_pixels: int = 28 * 28 * 130, - max_pixels: int = 28 * 28 * 1280, + factor: int, + min_pixels: int, + max_pixels: int, ): if height < factor: logger.warning( @@ -887,9 +885,9 @@ class Projector(nn.Module): def forward( self, - image_features: torch.Tensor, + image_features: Union[torch.Tensor, list[torch.Tensor]], image_grid_thw: list[tuple[int, int, int]], - ) -> torch.Tensor: + ) -> Union[torch.Tensor, list[torch.Tensor]]: m1, m2 = self.merge_kernel_size if isinstance(image_features, (list, tuple)): processed_features = list() @@ -986,6 +984,12 @@ class KeyeMultiModalDataParser(MultiModalDataParser): class KeyeProcessingInfo(BaseProcessingInfo): + def get_max_image_size(self) -> int: + return 9999999 #_MAX_IMAGE_SIZE + + def get_max_frame_per_video(self) -> int: + return 16 #_MAX_FRAMES_PER_VIDEO + def get_image_processor(self, **kwargs: object): return self.get_hf_processor(**kwargs).image_processor @@ -1077,8 +1081,8 @@ class KeyeProcessingInfo(BaseProcessingInfo): def get_image_size_with_most_features(self, ) -> ImageSize: max_image_size, _ = self._get_vision_info( - image_width=_MAX_IMAGE_SIZE, - image_height=_MAX_IMAGE_SIZE, + image_width=self.get_max_image_size(), + image_height=self.get_max_image_size(), image_processor=None, ) return max_image_size @@ -1123,7 +1127,7 @@ class KeyeProcessingInfo(BaseProcessingInfo): max_image_tokens) max_frames_per_video = min( max_total_frames // max(max_videos, 1), - _MAX_FRAMES_PER_VIDEO, + self.get_max_frame_per_video(), ) return max(max_frames_per_video, 1) @@ -1139,7 +1143,10 @@ class 
KeyeProcessingInfo(BaseProcessingInfo): ) -class KeyeDummyInputsBuilder(BaseDummyInputsBuilder[KeyeProcessingInfo]): +_I = TypeVar("_I", bound=KeyeProcessingInfo) + + +class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]): def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: num_images = mm_counts.get("image", 0) @@ -1183,6 +1190,10 @@ class KeyeDummyInputsBuilder(BaseDummyInputsBuilder[KeyeProcessingInfo]): return mm_data +class KeyeDummyInputsBuilder(KeyeBaseDummyInputsBuilder[KeyeProcessingInfo]): + ... + + class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]): def _get_data_parser(self) -> MultiModalDataParser: @@ -1231,13 +1242,7 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]): return _keye_field_config(hf_inputs) -@MULTIMODAL_REGISTRY.register_processor( - KeyeMultiModalProcessor, - info=KeyeProcessingInfo, - dummy_inputs=KeyeDummyInputsBuilder, -) -class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, - SupportsPP): +class BaseKeyeModule(nn.Module): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -1264,6 +1269,11 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, raise ValueError("Only image or video modality is supported") + def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): + if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)): + return None + return quant_config + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config: PretrainedConfig = vllm_config.model_config.hf_config @@ -1278,7 +1288,8 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, quant_config=self._maybe_ignore_quant_config(quant_config), prefix=maybe_prefix(prefix, "visual"), ) - self.mlp_AR = Projector( + + self.mlp_AR = self._build_projector( config, config.vision_config, quant_config=self._maybe_ignore_quant_config(quant_config), @@ -1294,13 +1305,287 @@ class 
KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): - if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)): - return None - return quant_config + @abstractmethod + def _build_projector(self, + text_config: PretrainedConfig, + vision_config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> nn.Module: + raise ValueError("Need projector") - def _validate_and_reshape_mm_tensor(self, mm_input: NestedTensors, - name: str) -> torch.Tensor: + def _process_image_input(self, + image_input: Any) -> tuple[torch.Tensor, ...]: + siglip_position_ids = list() + image_grid_hws = list() + sample_indices = list() + cu_seqlens = [0] + + image_grid_thw = image_input["image_grid_thw"] + assert image_grid_thw.ndim == 2 + + for idx, thaw in enumerate(image_grid_thw): + thw_tuple = tuple(thaw.detach().cpu().numpy().tolist()) + numel = np.prod(thw_tuple) + image_grid_hws.append(thw_tuple) + image_position_ids = torch.arange(numel) % np.prod(thw_tuple[1:]) + siglip_position_ids.append(image_position_ids) + sample_indices.append(torch.full((numel, ), idx, + dtype=torch.int64)) + cu_seqlens.append(cu_seqlens[-1] + numel) + + if image_input["type"] == "image_embeds": + raise ValueError( + "Image embeddings are not supported for this processing path.") + else: + pixel_values = image_input["pixel_values"].type(self.visual.dtype) + siglip_position_ids = torch.concat(siglip_position_ids, + dim=0).to(pixel_values.device) + cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32).to( + pixel_values.device) + sample_indices = torch.concat(sample_indices, + dim=0).to(pixel_values.device) + + image_embeds = self.visual( + pixel_values=pixel_values, + image_grid_thw=image_grid_hws, + position_ids=siglip_position_ids, + vision_return_embed_list=False, + 
interpolate_pos_encoding=True, + sample_indices=sample_indices, + cu_seqlens=cu_seqlens, + use_rope=True, + window_size=-1, + ) + image_embeds = tuple(self.mlp_AR(image_embeds, image_grid_thw)) + return image_embeds + + def _process_video_embeds( + self, + video_type: Literal["video_embeds", "pixel_values_videos"], + video_grid_thw: list[torch.Tensor], + pixel_values_videos: Optional[torch.Tensor] = None + ) -> Union[torch.Tensor, list[torch.Tensor]]: + siglip_position_ids = list() + video_grid_hws = list() + sample_indices = list() + cu_seqlens = [0] + + assert video_grid_thw.ndim == 2 + for idx, sub_thw in enumerate(video_grid_thw): + thw_tuple = tuple(sub_thw.detach().cpu().numpy().tolist()) + numel = np.prod(thw_tuple) + + video_grid_hws.append(thw_tuple) + video_position_ids = torch.arange(numel) % np.prod(thw_tuple[1:]) + siglip_position_ids.append(video_position_ids) + sample_indices.append(torch.full((numel, ), idx, + dtype=torch.int64)) + cu_seqlens.append(cu_seqlens[-1] + numel) + + if video_type == "video_embeds": + raise ValueError( + "Video embeddings are not supported for this processing path.") + else: + pixel_values_videos = pixel_values_videos.type(self.visual.dtype) + siglip_position_ids = torch.concat(siglip_position_ids, dim=0).to( + pixel_values_videos.device) + cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32).to( + pixel_values_videos.device) + sample_indices = torch.concat(sample_indices, + dim=0).to(pixel_values_videos.device) + + video_embeds = self.visual( + pixel_values=pixel_values_videos, + image_grid_thw=video_grid_hws, + position_ids=siglip_position_ids, + vision_return_embed_list=True, + interpolate_pos_encoding=True, + sample_indices=sample_indices, + cu_seqlens=cu_seqlens, + use_rope=True, + window_size=-1, + ) + video_embeds = self.mlp_AR(video_embeds, video_grid_thw) + return video_embeds + + def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: + modalities = {} + + for input_key in kwargs: + if 
(input_key in ("pixel_values", "image_embeds") + and "images" not in modalities): + modalities["images"] = self._parse_and_validate_image_input( + **kwargs) + if (input_key in ("pixel_values_videos", "video_embeds") + and "videos" not in modalities): + modalities["videos"] = self._parse_and_validate_video_input( + **kwargs) + + return modalities + + def get_language_model(self) -> torch.nn.Module: + return self.language_model + + def get_multimodal_embeddings( + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + + modalities = self._parse_and_validate_multimodal_inputs(**kwargs) + if not modalities: + return None + + multimodal_embeddings: tuple[torch.Tensor, ...] = () + + for modality in modalities: + if modality == "images": + image_input = modalities["images"] + vision_embeddings = self._process_image_input(image_input) + multimodal_embeddings += vision_embeddings + if modality == "videos": + video_input = modalities["videos"] + video_embeddings = self._process_video_input(video_input) + multimodal_embeddings += video_embeddings + return multimodal_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + multimodal_embeddings, + [ + self.config.image_token_id, + self.config.video_token_id, + ], + ) + return inputs_embeds + + def get_input_embeddings_v0( + self, + input_ids: torch.Tensor, + image_input: Optional[Any] = None, + video_input: Optional[Any] = None, + ) -> torch.Tensor: + inputs_embeds = self.get_input_embeddings(input_ids) + if image_input is not None: + image_embeds = self._process_image_input(image_input) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + image_embeds, + placeholder_token_id=self.config.image_token_id, + ) 
+ + if video_input is not None: + video_embeds = self._process_video_input(video_input) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + video_embeds, + placeholder_token_id=self.config.video_token_id, + ) + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + """Run forward pass for Keye-VL. + + Args: + input_ids: Flattened (concatenated) input_ids corresponding to a + batch. + positions: Flattened (concatenated) position ids corresponding to a + batch. + **NOTE**: If mrope is enabled (default setting for Qwen2-VL + opensource models), the shape will be `(3, seq_len)`, + otherwise it will be `(seq_len,). + pixel_values: Pixel values to be fed to a model. + `None` if no images are passed. + image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM. + `None` if no images are passed. + pixel_values_videos: Pixel values of videos to be fed to a model. + `None` if no videos are passed. + video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. + `None` if no videos are passed. 
+ """ + if intermediate_tensors is not None: + inputs_embeds = None + + elif inputs_embeds is None: + image_input = self._parse_and_validate_image_input(**kwargs) + video_input = self._parse_and_validate_video_input(**kwargs) + if image_input is None and video_input is None: + inputs_embeds = None + else: + if uses_mrope(self.config): + assert positions.ndim == 2 and positions.size(0) == 3, ( + "multimodal section rotary embedding requires " + f"(3, seq_len) positions, but got {positions.size()}") + inputs_embeds = self.get_input_embeddings_v0( + input_ids, + image_input=image_input, + video_input=video_input, + ) + input_ids = None + + hidden_states = self.language_model.model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + ) + + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + + def get_mm_mapping(self) -> MultiModelKeys: + """Get the module prefix in multimodal models.""" + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="mlp_AR.", + tower_model="visual.", + ) + + +@MULTIMODAL_REGISTRY.register_processor( + KeyeMultiModalProcessor, + info=KeyeProcessingInfo, + dummy_inputs=KeyeDummyInputsBuilder, +) +class KeyeForConditionalGeneration(BaseKeyeModule, SupportsMultiModal, + SupportsLoRA, SupportsPP): + + def _build_projector(self, + text_config: PretrainedConfig, + vision_config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> nn.Module: + return Projector(text_config, vision_config, quant_config, prefix) + + def 
_validate_and_reshape_mm_tensor( + self, mm_input: NestedTensors, + name: str) -> Union[torch.Tensor, list[torch.Tensor]]: if not isinstance(mm_input, (torch.Tensor, list)): raise ValueError(f"Incorrect type of {name}. " f"Got type: {type(mm_input)}") @@ -1388,257 +1673,12 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, video_grid_thw=video_grid_thw, ) - def _process_image_input( - self, image_input: KeyeImageInputs) -> tuple[torch.Tensor, ...]: - siglip_position_ids = list() - image_grid_hws = list() - sample_indices = list() - cu_seqlens = [0] - - image_grid_thw = image_input["image_grid_thw"] - assert image_grid_thw.ndim == 2 - - for idx, thaw in enumerate(image_grid_thw): - thw_tuple = tuple(thaw.detach().cpu().numpy().tolist()) - numel = np.prod(thw_tuple) - image_grid_hws.append(thw_tuple) - image_position_ids = torch.arange(numel) % np.prod(thw_tuple[1:]) - siglip_position_ids.append(image_position_ids) - sample_indices.append(torch.full((numel, ), idx, - dtype=torch.int64)) - cu_seqlens.append(cu_seqlens[-1] + numel) - - if image_input["type"] == "image_embeds": - raise ValueError( - "Image embeddings are not supported for this processing path.") - else: - pixel_values = image_input["pixel_values"].type(self.visual.dtype) - siglip_position_ids = torch.concat(siglip_position_ids, - dim=0).to(pixel_values.device) - cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32).to( - pixel_values.device) - sample_indices = torch.concat(sample_indices, - dim=0).to(pixel_values.device) - - image_embeds = self.visual( - pixel_values=pixel_values, - image_grid_thw=image_grid_hws, - position_ids=siglip_position_ids, - vision_return_embed_list=False, - interpolate_pos_encoding=True, - sample_indices=sample_indices, - cu_seqlens=cu_seqlens, - use_rope=True, - window_size=-1, - ) - image_embeds = tuple(self.mlp_AR(image_embeds, image_grid_thw)) - return image_embeds - def _process_video_input( self, video_input: KeyeVideoInputs) -> 
tuple[torch.Tensor, ...]: - siglip_position_ids = list() - video_grid_hws = list() - sample_indices = list() - cu_seqlens = [0] - + video_type = video_input["type"] video_grid_thw = video_input["video_grid_thw"] - assert video_grid_thw.ndim == 2 + pixel_values_videos = video_input.get("pixel_values_videos", None) - for idx, thaw in enumerate(video_grid_thw): - thw_tuple = tuple(thaw.detach().cpu().numpy().tolist()) - numel = np.prod(thw_tuple) - - video_grid_hws.append(thw_tuple) - video_position_ids = torch.arange(numel) % np.prod(thw_tuple[1:]) - siglip_position_ids.append(video_position_ids) - sample_indices.append(torch.full((numel, ), idx, - dtype=torch.int64)) - cu_seqlens.append(cu_seqlens[-1] + numel) - - if video_input["type"] == "video_embeds": - raise ValueError( - "Video embeddings are not supported for this processing path.") - else: - pixel_values_videos = video_input["pixel_values_videos"].type( - self.visual.dtype) - siglip_position_ids = torch.concat(siglip_position_ids, dim=0).to( - pixel_values_videos.device) - cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32).to( - pixel_values_videos.device) - sample_indices = torch.concat(sample_indices, - dim=0).to(pixel_values_videos.device) - - video_embeds = self.visual( - pixel_values=pixel_values_videos, - image_grid_thw=video_grid_hws, - position_ids=siglip_position_ids, - vision_return_embed_list=True, - interpolate_pos_encoding=True, - sample_indices=sample_indices, - cu_seqlens=cu_seqlens, - use_rope=True, - window_size=-1, - ) - video_embeds = tuple(self.mlp_AR(video_embeds, video_grid_thw)) - return video_embeds - - def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: - modalities = {} - - for input_key in kwargs: - if (input_key in ("pixel_values", "image_embeds") - and "images" not in modalities): - modalities["images"] = self._parse_and_validate_image_input( - **kwargs) - if (input_key in ("pixel_values_videos", "video_embeds") - and "videos" not in modalities): - 
modalities["videos"] = self._parse_and_validate_video_input( - **kwargs) - - return modalities - - def get_language_model(self) -> torch.nn.Module: - return self.language_model - - def get_multimodal_embeddings( - self, **kwargs: object) -> Optional[MultiModalEmbeddings]: - - modalities = self._parse_and_validate_multimodal_inputs(**kwargs) - if not modalities: - return None - - multimodal_embeddings: tuple[torch.Tensor, ...] = () - - for modality in modalities: - if modality == "images": - image_input = modalities["images"] - vision_embeddings = self._process_image_input(image_input) - multimodal_embeddings += vision_embeddings - if modality == "videos": - video_input = modalities["videos"] - video_embeddings = self._process_video_input(video_input) - multimodal_embeddings += video_embeddings - return multimodal_embeddings - - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[MultiModalEmbeddings] = None, - ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if multimodal_embeddings is not None: - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - multimodal_embeddings, - [ - self.config.image_token_id, - self.config.video_token_id, - ], - ) - return inputs_embeds - - def get_input_embeddings_v0( - self, - input_ids: torch.Tensor, - image_input: Optional[KeyeImagePixelInputs] = None, - video_input: Optional[KeyeVideoPixelInputs] = None, - ) -> torch.Tensor: - inputs_embeds = self.get_input_embeddings(input_ids) - if image_input is not None: - image_embeds = self._process_image_input(image_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - image_embeds, - placeholder_token_id=self.config.image_token_id, - ) - - if video_input is not None: - video_embeds = self._process_video_input(video_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, - inputs_embeds, - video_embeds, - 
placeholder_token_id=self.config.video_token_id, - ) - return inputs_embeds - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - **kwargs: object, - ) -> Union[torch.Tensor, IntermediateTensors]: - """Run forward pass for Qwen2-VL. - - Args: - input_ids: Flattened (concatenated) input_ids corresponding to a - batch. - positions: Flattened (concatenated) position ids corresponding to a - batch. - **NOTE**: If mrope is enabled (default setting for Qwen2-VL - opensource models), the shape will be `(3, seq_len)`, - otherwise it will be `(seq_len,). - pixel_values: Pixel values to be fed to a model. - `None` if no images are passed. - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM. - `None` if no images are passed. - pixel_values_videos: Pixel values of videos to be fed to a model. - `None` if no videos are passed. - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. - `None` if no videos are passed. 
- """ - - if intermediate_tensors is not None: - inputs_embeds = None - - elif inputs_embeds is None: - image_input = self._parse_and_validate_image_input(**kwargs) - video_input = self._parse_and_validate_video_input(**kwargs) - - if image_input is None and video_input is None: - inputs_embeds = None - else: - if uses_mrope(self.config): - assert positions.ndim == 2 and positions.size(0) == 3, ( - "multimodal section rotary embedding requires " - f"(3, seq_len) positions, but got {positions.size()}") - inputs_embeds = self.get_input_embeddings_v0( - input_ids, - image_input=image_input, - video_input=video_input, - ) - input_ids = None - - hidden_states = self.language_model.model( - input_ids=input_ids, - positions=positions, - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds, - ) - return hidden_states - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - return self.language_model.compute_logits(hidden_states, - sampling_metadata) - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - - loader = AutoWeightsLoader(self) - return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) - - def get_mm_mapping(self) -> MultiModelKeys: - """Get the module prefix in multimodal models.""" - return MultiModelKeys.from_string_field( - language_model="language_model", - connector="visual.", - tower_model="mlp_AR.", - ) + return tuple( + self._process_video_embeds(video_type, video_grid_thw, + pixel_values_videos)) diff --git a/vllm/model_executor/models/keye_vl1_5.py b/vllm/model_executor/models/keye_vl1_5.py new file mode 100644 index 0000000000000..605c6d3eaf643 --- /dev/null +++ b/vllm/model_executor/models/keye_vl1_5.py @@ -0,0 +1,601 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import itertools +from collections.abc import Mapping, Sequence +from functools 
import partial +from typing import Annotated, Any, Literal, Optional, Union + +import numpy as np +import torch +import torch.nn as nn +from einops import rearrange +from transformers import PretrainedConfig +from transformers.activations import GELUActivation +from transformers.feature_extraction_utils import BatchFeature + +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors +from vllm.multimodal.inputs import (ImageItem, ModalityData, + MultiModalFieldConfig, + MultiModalKwargsItems, VideoItem) +from vllm.multimodal.parse import (DictEmbeddingItems, ModalityDataItems, + MultiModalDataItems, MultiModalDataParser) +from vllm.multimodal.processing import (PromptReplacement, PromptUpdate, + PromptUpdateDetails) +from vllm.utils.tensor_schema import TensorSchema, TensorShape + +from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP +from .keye import (BaseKeyeModule, BaseMultiModalProcessor, + KeyeBaseDummyInputsBuilder, KeyeProcessingInfo) + +logger = init_logger(__name__) + + +def split_thw(grid_thw: torch.Tensor) -> torch.Tensor: + """ + Split grid_thw in t dimension. + + Args: + grid_thw: [N, 3] tensor of [t, h, w] + + Returns: + [Σt, 3] tensor where each row is [1, h, w] + + Example: + >>> grid_thw = torch.tensor([[2, 3, 4], [1, 5, 6]]) + >>> split_thw(grid_thw) + tensor([[1, 3, 4], + [1, 3, 4], + [1, 5, 6]]) + """ + t = grid_thw[:, 0] + h_w = grid_thw[:, 1:] + ones = torch.ones_like(h_w[:, :1]) + return torch.cat([ones, h_w], dim=1).repeat_interleave(t, dim=0) + + +def get_num_patches(grid_thw: torch.Tensor, num_frames: Union[list[int], + torch.Tensor]): + """ + Return num_patches per video. + + Args: + t: tensor with shape [N, ...] 
where each item is a list/tensor + cu_seqlens: list indicating the boundaries of groups + + Returns: + list of ints representing the sum of products for each group + + Examples: + >>> # Suppose there are 2 videos with a total of 3 grids + >>> grid_thw = torch.tensor([[2, 2, 2], # grid 0: 2*2*2=8 patches + ... [2, 2, 2], # grid 1: 2*2*2=8 patches + ... [1, 1, 1]]) # grid 2: 1*1*1=1 patches + >>> num_frames = [2, 1] # The first video contains 2 grids, + the second contains 1 grid. + >>> get_num_patches(grid_thw, num_frames) + tensor([16, 1]) # Total patches for first video: 8+8=16, + second video: 1. + """ + + assert len(grid_thw.shape) == 2 + if isinstance(num_frames, torch.Tensor): + num_frames = num_frames.clone().tolist() + + num_grids_per_frame = grid_thw.prod(dim=1) + start_idx_per_video = [0, *itertools.accumulate(num_frames)] + num_patches = [ + num_grids_per_frame[start_idx_per_video[i]:start_idx_per_video[i + 1]]. + sum() for i in range(len(num_frames)) + ] + return torch.stack(num_patches) if num_patches else torch.zeros( + 0, dtype=grid_thw.dtype, device=grid_thw.device) + + +class KeyeVL1_5ImagePixelInputs(TensorSchema): + """ + Dimensions: + - b: Batch size + - np: Number of patches + - c: Number of channels + - ps: Patch size + - ni: Number of images + - g: Grid dimensions (3 for t, h, w) + """ + type: Literal["pixel_values"] + + pixel_values: Annotated[ + torch.Tensor, + TensorShape("np", 3, "ps", "ps", dynamic_dims={"np"})] + + image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)] + + +class KeyeVL1_5ImageEmbeddingInputs(TensorSchema): + """ + Dimensions: + - nf: Number of image features + - hs: Hidden size (must match the hidden size of language model + backbone) + - ni: Number of images + - g: Grid dimensions (3 for t, h, w) + """ + type: Literal["image_embeds"] + image_embeds: Annotated[torch.Tensor, TensorShape("nf", "hs")] + image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)] + + +KeyeVL1_5ImageInputs = 
Union[KeyeVL1_5ImagePixelInputs, + KeyeVL1_5ImageEmbeddingInputs] + + +class KeyeVL1_5VideoPixelInputs(TensorSchema): + """ + Dimensions: + - b: Batch size + - np: Number of patches + - c: Number of channels + - ps: Patch size + - ni: Number of images + - g: Grid dimensions (3 for t, h, w) + """ + type: Literal["pixel_values_videos"] + pixel_values_videos: Annotated[ + torch.Tensor, + TensorShape("np", 3, "ps", "ps", dynamic_dims={"np"})] + video_grid_thw: Annotated[torch.Tensor, TensorShape("nv", 3)] + + num_frames: torch.Tensor + + +class KeyeVL1_5VideoEmbeddingInputs(TensorSchema): + """ + Dimensions: + - nf: Number of video features + - hs: Hidden size (must match the hidden size of language model + backbone) + - nv: Number of videos + - g: Grid dimensions (3 for t, h, w) + """ + type: Literal["video_embeds"] + video_embeds: Annotated[torch.Tensor, TensorShape("nf", "hs")] + video_grid_thw: Annotated[torch.Tensor, TensorShape("nv", 3)] + num_frames: torch.Tensor + + +KeyeVL1_5VideoInputs = Union[KeyeVL1_5VideoPixelInputs, + KeyeVL1_5VideoEmbeddingInputs] + + +class KeyeVL1_5Projector(nn.Module): + + def __init__( + self, + text_config: PretrainedConfig, + vision_config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.text_config = text_config + self.vision_config = vision_config + self.merge_kernel_size = (2, 2) + + self.hidden_size = (self.vision_config.hidden_size * + self.merge_kernel_size[0] * + self.merge_kernel_size[1]) + + self.pre_norm = torch.nn.LayerNorm(self.hidden_size, eps=1e-05) + self.act = GELUActivation() + + self.linear_1 = ColumnParallelLinear( + self.hidden_size, + self.hidden_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.linear_1", + ) + self.linear_2 = RowParallelLinear( + self.hidden_size, + self.text_config.hidden_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.linear_2", + ) + + def forward( + self, + image_features: 
Union[torch.Tensor, tuple[torch.Tensor], + list[torch.Tensor]], + image_grid_thw: list[tuple[int, int, int]], + ) -> Union[torch.Tensor, list[torch.Tensor]]: + m1, m2 = self.merge_kernel_size + if isinstance(image_features, (list, tuple)): + processed_features = list() + for image_feature, image_grid in zip(image_features, + image_grid_thw): + t, h, w = image_grid + image_feature = rearrange( + image_feature, + "(t h p1 w p2) d -> (t h w) (p1 p2 d)", + t=t, + h=h // m1, + p1=m1, + w=w // m2, + p2=m2, + ) + image_feature = self.pre_norm(image_feature) + hidden_states, _ = self.linear_1(image_feature) + hidden_states = self.act(hidden_states) + hidden_states, _ = self.linear_2(hidden_states) + processed_features.append(hidden_states) + + return processed_features + + dims = image_features.shape[:-1] + dim = image_features.shape[-1] + image_features = image_features.view(np.prod(dims), dim) + hidden_states = self.pre_norm(image_features.view( + -1, self.hidden_size)) + hidden_states = self.linear_1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + + return hidden_states.view(*dims, -1) + + +class KeyeVL1_5ProcessingInfo(KeyeProcessingInfo): + + def get_max_frame_per_video(self) -> int: + return 2048 + + def get_supported_mm_limits(self, ) -> Mapping[str, Optional[int]]: + return {"image": None, "video": 1} + + +def _keye_field_config(hf_inputs: Mapping[str, torch.Tensor], ): + image_grid_thw = hf_inputs.get("image_grid_thw", + torch.empty((0, 3), dtype=torch.int64)) + image_grid_sizes = image_grid_thw.prod(-1) + + video_grid_thw = hf_inputs.get("video_grid_thw", + torch.empty((0, 3), dtype=torch.int64)) + video_grid_thw = split_thw(video_grid_thw) + num_frames = hf_inputs.get("num_frames", + video_grid_thw[:, 0]).clone().tolist() + + video_num_patches = get_num_patches(video_grid_thw, num_frames) + + video_num_grids = [] + if len(num_frames) > 0: + i = 0 + j = 1 + cur_frames = num_frames[i] + for t, _, _ in 
video_grid_thw.tolist(): + cur_frames -= t + if cur_frames == 0: + video_num_grids.append(j) + i += 1 + if i < len(num_frames): + cur_frames = num_frames[i] + j = 1 + else: + j += 1 + video_num_grids = torch.tensor(video_num_grids) + return dict(pixel_values=MultiModalFieldConfig.flat_from_sizes( + "image", image_grid_sizes), + image_embeds=MultiModalFieldConfig.flat_from_sizes( + "image", image_grid_sizes), + image_grid_thw=MultiModalFieldConfig.batched("image"), + pixel_values_videos=MultiModalFieldConfig.flat_from_sizes( + "video", video_num_patches), + video_embeds=MultiModalFieldConfig.flat_from_sizes( + "video", video_num_patches), + video_grid_thw=MultiModalFieldConfig.flat_from_sizes( + "video", video_num_grids), + num_frames=MultiModalFieldConfig.batched("video")) + + +class KeyeVL1_5MultiModalDataParser(MultiModalDataParser): + + def _parse_image_data( + self, + data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]], + ) -> ModalityDataItems[Any, Any]: + if isinstance(data, dict): + return DictEmbeddingItems( + data, + modality="image", + required_fields={ + "image_embeds", + "image_grid_thw", + }, + fields_factory=_keye_field_config, + ) + + return super()._parse_image_data(data) + + def _parse_video_data( + self, + data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]], + ) -> ModalityDataItems[Any, Any]: + if isinstance(data, dict): + return DictEmbeddingItems( + data, + modality="video", + required_fields={ + "video_embeds", + "video_grid_thw", + }, + fields_factory=_keye_field_config, + ) + + return super()._parse_video_data(data) + + +class KeyeVL1_5MultiModalProcessor( + BaseMultiModalProcessor[KeyeVL1_5ProcessingInfo]): + + def _get_data_parser(self) -> MultiModalDataParser: + return KeyeVL1_5MultiModalDataParser() + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, Any], + out_mm_kwargs: MultiModalKwargsItems, + ) -> Sequence[PromptUpdate]: + hf_processor = 
self.info.get_hf_processor(**hf_processor_mm_kwargs) + image_processor = self.info.get_image_processor( + **hf_processor_mm_kwargs) + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() + image_token_id = vocab[hf_processor.image_token] + video_token_id = vocab[hf_processor.video_token] + placeholder = {"image": image_token_id, "video": video_token_id} + merge_length = image_processor.merge_size**2 + + out_mm_kwargs_data = out_mm_kwargs.get_data() + frame_types: list[torch.Tensor] = \ + hf_processor_mm_kwargs.get("frame_types", None) + timestamps: list[torch.Tensor] = \ + hf_processor_mm_kwargs.get("timestamps", None) + num_videos = mm_items.get_count("video", strict=False) + + if frame_types is None: + frame_types = [None] * num_videos + assert len(frame_types) == num_videos, \ + f"Number of frame_types={len(frame_types)} " \ + f"doesn't equal to number of videos={num_videos}" + if timestamps is None: + timestamps = [None] * num_videos + assert len(timestamps) == num_videos, \ + f"Number of timestamps={len(timestamps)} " \ + f"doesn't equal to number of videos={num_videos}" + + video_grid_thw = out_mm_kwargs_data.get( + 'video_grid_thw', torch.empty((0, 3), dtype=torch.int64)) + num_frames = out_mm_kwargs_data.get( + 'num_frames', torch.tensor([], dtype=torch.int64)) + + assert len(num_frames) == num_videos, \ + f"Size of num_frames={len(num_frames)} " \ + f"doesn't equal to number of videos={num_videos}" + + video_grid_hws = split_thw(video_grid_thw) + assert int(num_frames.sum().tolist()) == video_grid_hws.shape[0], ( + f"The first dimension of `video_grid_hws`={video_grid_hws.shape[0]}" + f"doesn't equal to num of frames.") + + cu_seqlens = torch.cumsum(torch.tensor([0] + num_frames.tolist()), + dim=-1) + + def get_replacement_keye(item_idx: int, modality: str): + """ + Args: + item_idx(int): The item index of modality to replace + modality(str): The modality + """ + if modality == "image": + out_item = out_mm_kwargs[modality][item_idx] + 
grid_thw = out_item[f"{modality}_grid_thw"].data + assert isinstance(grid_thw, torch.Tensor) + + num_tokens = int(grid_thw.prod()) // merge_length + return [image_token_id] * num_tokens + elif modality == "video": + placeholders = [] + video_timestamps = timestamps[item_idx] + video_frame_types = frame_types[item_idx] + grid_thw = video_grid_hws[ + cu_seqlens[item_idx]:cu_seqlens[item_idx + 1]] + + nframes = grid_thw.shape[0] + + if video_timestamps is None: + video_timestamps = [""] * nframes + else: + video_timestamps = [ + format(ts, ".1f") for ts in video_timestamps + ] + + if video_frame_types is None: + video_frame_types = [0] * nframes + for i, sub_thw in enumerate(grid_thw): + s = f"{hf_processor.frame_token}{video_timestamps[i]}" + if video_frame_types[i] == 1: + s += hf_processor.fast_start + placeholders.extend(tokenizer.encode(s)) + num_frame_tokens = int(sub_thw.prod()) // merge_length + placeholders.extend([video_token_id] * num_frame_tokens) + if video_frame_types[i] == 1: + placeholders.append(vocab[hf_processor.fast_end]) + + return PromptUpdateDetails.select_token_id( + placeholders, embed_token_id=video_token_id) + else: + raise ValueError(f"Unsupported modality {modality}") + + return [ + PromptReplacement( + modality=modality, + target=[placeholder[modality]], + replacement=partial(get_replacement_keye, modality=modality), + ) for modality in ("image", "video") + ] + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return _keye_field_config(hf_inputs) + + +class KeyeVL1_5DummyInputsBuilder( + KeyeBaseDummyInputsBuilder[KeyeVL1_5ProcessingInfo]): + ... 
+ + +@MULTIMODAL_REGISTRY.register_processor( + KeyeVL1_5MultiModalProcessor, + info=KeyeVL1_5ProcessingInfo, + dummy_inputs=KeyeVL1_5DummyInputsBuilder, +) +class KeyeVL1_5ForConditionalGeneration(BaseKeyeModule, SupportsMultiModal, + SupportsLoRA, SupportsPP): + + def _build_projector(self, + text_config: PretrainedConfig, + vision_config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> nn.Module: + return KeyeVL1_5Projector(text_config, vision_config, quant_config, + prefix) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config: PretrainedConfig = vllm_config.model_config.hf_config + self.merge_size = config.vision_config.spatial_merge_size + super().__init__(vllm_config=vllm_config, prefix=prefix) + + def _validate_and_reshape_mm_tensor(self, mm_input: NestedTensors, + expected_dim: int, name: str): + if not isinstance(mm_input, (torch.Tensor, list)): + raise ValueError(f"Incorrect type of {name}. " + f"Got type: {type(mm_input)}") + if isinstance(mm_input, torch.Tensor): + if mm_input.ndim == expected_dim: + return mm_input + elif mm_input.ndim == expected_dim + 1: + return torch.concat(list(mm_input)) + else: + raise ValueError( + f"{name} should be {expected_dim}D or " + f"batched {expected_dim}D tensor." 
+ f"Got ndim: {mm_input.ndim} (shape={mm_input.shape})") + else: + return torch.concat(list(mm_input)) + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[KeyeVL1_5ImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + image_grid_thw = kwargs.pop("image_grid_thw", None) + + if pixel_values is None and image_embeds is None: + return None + + if pixel_values is not None: + pixel_values = self._validate_and_reshape_mm_tensor( + pixel_values, expected_dim=4, name="image pixel values") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, expected_dim=2, name="image grid_thw") + + return KeyeVL1_5ImagePixelInputs( + type="pixel_values", + pixel_values=pixel_values, + image_grid_thw=image_grid_thw, + ) + + if image_embeds is not None: + image_embeds = self._validate_and_reshape_mm_tensor( + image_embeds, expected_dim=2, name="image embeds") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, expected_dim=2, name="image grid_thw") + + return KeyeVL1_5ImageEmbeddingInputs( + type="image_embeds", + image_embeds=image_embeds, + image_grid_thw=image_grid_thw, + ) + + def _parse_and_validate_video_input( + self, **kwargs: object) -> Optional[KeyeVL1_5VideoInputs]: + pixel_values_videos = kwargs.pop("pixel_values_videos", None) + video_embeds = kwargs.pop("video_embeds", None) + video_grid_thw = kwargs.pop("video_grid_thw", None) + num_frames = kwargs.pop("num_frames", None) + + if pixel_values_videos is None and video_embeds is None: + return None + + if pixel_values_videos is not None: + pixel_values_videos = self._validate_and_reshape_mm_tensor( + pixel_values_videos, + expected_dim=4, + name="video pixel values", + ) + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, expected_dim=2, name="video grid_thw") + + num_frames = self._validate_and_reshape_mm_tensor( + num_frames, expected_dim=1, name="video num frames") + + 
return KeyeVL1_5VideoPixelInputs( + type="pixel_values_videos", + pixel_values_videos=pixel_values_videos, + video_grid_thw=video_grid_thw, + num_frames=num_frames) + + if video_embeds is not None: + video_embeds = self._validate_and_reshape_mm_tensor( + video_embeds, expected_dim=2, name="video embeds") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, expected_dim=2, name="video grid_thw") + + return KeyeVL1_5VideoEmbeddingInputs(type="video_embeds", + video_embeds=video_embeds, + video_grid_thw=video_grid_thw, + num_frames=num_frames) + + def _process_video_input( + self, + video_input: KeyeVL1_5VideoInputs) -> tuple[torch.Tensor, ...]: + video_type = video_input["type"] + video_grid_thw = split_thw(video_input["video_grid_thw"]) + pixel_values_videos = video_input.get("pixel_values_videos", None) + + video_embeds = self._process_video_embeds(video_type, video_grid_thw, + pixel_values_videos) + video_embeds = torch.concat(video_embeds, dim=0) + + num_frames = video_input["num_frames"].clone().tolist() + + num_patches = get_num_patches(video_grid_thw, num_frames).tolist() + + patch_cu_seqlens = torch.cumsum( + torch.tensor([0] + num_patches).detach().clone(), dim=-1) + patch_cu_seqlens = torch.div(patch_cu_seqlens, + self.merge_size**2, + rounding_mode="floor") + + new_video_embeds = [] + for idx in range(patch_cu_seqlens.shape[0] - 1): + start = patch_cu_seqlens[idx] + end = patch_cu_seqlens[idx + 1] + new_video_embeds.append(video_embeds[start:end]) + return tuple(new_video_embeds) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 98115f8623563..edb7f24214406 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -227,6 +227,7 @@ _MULTIMODAL_MODELS = { "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"), "SmolVLMForConditionalGeneration": ("smolvlm","SmolVLMForConditionalGeneration"), # noqa: E501 
"KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"), + "KeyeVL1_5ForConditionalGeneration": ("keye_vl1_5", "KeyeVL1_5ForConditionalGeneration"), # noqa: E501 "RForConditionalGeneration": ("rvl", "RForConditionalGeneration"), "KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"), # noqa: E501 "Llama_Nemotron_Nano_VL": ("nemotron_vl", "LlamaNemotronVLChatModel"), From 41c80698b3849969dcb5c5e40d0991b0eb4821cc Mon Sep 17 00:00:00 2001 From: Julien Debache Date: Mon, 1 Sep 2025 15:28:26 +0200 Subject: [PATCH 07/95] Document multi-proc method selection for profiling (#23802) Signed-off-by: jdebache --- docs/contributing/profiling.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index 74627e9062167..dffd62385e017 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -73,6 +73,8 @@ apt install nsight-systems-cli ### Example commands and usage +When profiling with `nsys`, it is advisable to set the environment variable `VLLM_WORKER_MULTIPROC_METHOD=spawn`. The default is to use the `fork` method instead of `spawn`. More information on the topic can be found in the [Nsight Systems release notes](https://docs.nvidia.com/nsight-systems/ReleaseNotes/index.html#general-issues). + #### Offline Inference For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node` before any existing script you would run for offline inference. 
From 39a22dcaac707ebc6c79bfbfc12d6375a2094f38 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 1 Sep 2025 08:54:01 -0700 Subject: [PATCH 08/95] [Misc] Minor code simplification for spec decode (#24053) Signed-off-by: Woosuk Kwon --- vllm/v1/core/sched/scheduler.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index d4391b1c2137a..e07d53ff84d37 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -873,19 +873,19 @@ class Scheduler(SchedulerInterface): scheduled_spec_token_ids = ( scheduler_output.scheduled_spec_decode_tokens.get(req_id)) if scheduled_spec_token_ids: + num_draft_tokens = len(scheduled_spec_token_ids) + num_accepted = len(generated_token_ids) - 1 + num_rejected = num_draft_tokens - num_accepted # num_computed_tokens represents the number of tokens # processed in the current step, considering scheduled # tokens and rejections. If some tokens are rejected, # num_computed_tokens is decreased by the number of rejected - # tokens, where is given by: - # len(scheduled_spec_token_ids) + 1 - len(generated_token_ids). - num_tokens_rejected = (len(scheduled_spec_token_ids) + 1 - - len(generated_token_ids)) - request.num_computed_tokens -= num_tokens_rejected + # tokens. 
+ request.num_computed_tokens -= num_rejected spec_decoding_stats = self.make_spec_decoding_stats( spec_decoding_stats, - num_draft_tokens=len(scheduled_spec_token_ids), - num_accepted_tokens=len(generated_token_ids) - 1) + num_draft_tokens=num_draft_tokens, + num_accepted_tokens=num_accepted) stopped = False new_logprobs = None From cf91a89dd2b89f92e5877d0d7b4c32c70da6f3c1 Mon Sep 17 00:00:00 2001 From: Christian Pinto Date: Mon, 1 Sep 2025 17:17:41 +0100 Subject: [PATCH 09/95] [docs][misc] IOProcessor plugins fixes (#24046) Signed-off-by: Christian Pinto --- docs/design/io_processor_plugins.md | 4 ++-- .../online_serving/prithvi_geospatial_mae.py | 1 + .../prithvi_io_processor/prithvi_processor.py | 19 +------------------ .../test_io_processor_plugins.py | 1 + vllm/entrypoints/openai/protocol.py | 3 ++- vllm/plugins/io_processors/interface.py | 9 +++++++-- 6 files changed, 14 insertions(+), 23 deletions(-) diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md index 8e5d5249409c6..ee474b5a7b997 100644 --- a/docs/design/io_processor_plugins.md +++ b/docs/design/io_processor_plugins.md @@ -64,9 +64,9 @@ The `parse_request` method is used for validating the user prompt and converting The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference. The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output. -The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/io_processor_pooling` serving endpoint is [here](../../vllm/entrypoints/openai/serving_pooling_with_io_plugin.py). +The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. 
The implementation of the `/io_processor_pooling` serving endpoint is available here . -An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please, also refer to our [online](../../examples/online_serving/prithvi_geospatial_mae.py) and [offline](../../examples/offline_inference/prithvi_geospatial_mae_io_processor.py) inference examples. +An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please, also refer to our online () and offline () inference examples. ## Using an IO Processor plugin diff --git a/examples/online_serving/prithvi_geospatial_mae.py b/examples/online_serving/prithvi_geospatial_mae.py index cbd34f461362c..31301e0042cf4 100644 --- a/examples/online_serving/prithvi_geospatial_mae.py +++ b/examples/online_serving/prithvi_geospatial_mae.py @@ -33,6 +33,7 @@ def main(): }, "priority": 0, "model": "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM", + "softmax": False, } ret = requests.post(server_endpoint, json=request_payload_url) diff --git a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py index d49a50b7a309f..0ebaafda94dc5 100644 --- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py +++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py @@ -8,7 +8,7 @@ import datetime import os import tempfile import urllib.request -from collections.abc import AsyncGenerator, Sequence +from collections.abc import Sequence from typing import Any, Optional, Union import albumentations @@ -359,14 +359,6 @@ class PrithviMultimodalDataProcessor(IOProcessor): return prompts - async def 
pre_process_async( - self, - prompt: IOProcessorInput, - request_id: Optional[str] = None, - **kwargs, - ) -> Union[PromptType, Sequence[PromptType]]: - return self.pre_process(prompt, request_id, **kwargs) - def post_process( self, model_output: Sequence[PoolingRequestOutput], @@ -421,15 +413,6 @@ class PrithviMultimodalDataProcessor(IOProcessor): data=out_data, request_id=request_id) - async def post_process_async( - self, - model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]], - request_id: Optional[str] = None, - **kwargs, - ) -> IOProcessorOutput: - collected_output = [item async for i, item in model_output] - return self.post_process(collected_output, request_id, **kwargs) - class PrithviMultimodalDataProcessorIndia(PrithviMultimodalDataProcessor): diff --git a/tests/plugins_tests/test_io_processor_plugins.py b/tests/plugins_tests/test_io_processor_plugins.py index 00fe429445d7d..b2fbef2ee25cb 100644 --- a/tests/plugins_tests/test_io_processor_plugins.py +++ b/tests/plugins_tests/test_io_processor_plugins.py @@ -113,6 +113,7 @@ async def test_prithvi_mae_plugin_online( }, "priority": 0, "model": model_name, + "softmax": False } ret = requests.post( diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 00b72f74cec85..30c3a82696155 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1424,9 +1424,10 @@ class IOProcessorRequest(OpenAIBaseModel, Generic[T]): When using plugins IOProcessor plugins, the actual input is processed by the plugin itself. 
Hence, we use a generic type for the request data """ + softmax: bool = True def to_pooling_params(self): - return PoolingParams(task="encode") + return PoolingParams(task="encode", softmax=self.softmax) class IOProcessorResponse(OpenAIBaseModel, Generic[T]): diff --git a/vllm/plugins/io_processors/interface.py b/vllm/plugins/io_processors/interface.py index 5c73188d5df51..62b224cac5e53 100644 --- a/vllm/plugins/io_processors/interface.py +++ b/vllm/plugins/io_processors/interface.py @@ -49,7 +49,12 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): request_id: Optional[str] = None, **kwargs, ) -> IOProcessorOutput: - collected_output = [item async for i, item in model_output] + # We cannot guarantee outputs are returned in the same order they were + # fed to vLLM. + # Let's sort them by id before post_processing + sorted_output = sorted([(i, item) async for i, item in model_output], + key=lambda output: output[0]) + collected_output = [output[1] for output in sorted_output] return self.post_process(collected_output, request_id, **kwargs) @abstractmethod @@ -59,4 +64,4 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): @abstractmethod def output_to_response( self, plugin_output: IOProcessorOutput) -> IOProcessorResponse: - raise NotImplementedError \ No newline at end of file + raise NotImplementedError From a0e0efd6bdcfd071e5f4123319628887dfd4973d Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Tue, 2 Sep 2025 00:56:56 +0800 Subject: [PATCH 10/95] [Model] Support DP for ViT on Kimi-VL-A3B-Thinking-2506 (#23817) Signed-off-by: Junhong Signed-off-by: LJH-LBJ <98734602+LJH-LBJ@users.noreply.github.com> Co-authored-by: Junhong Co-authored-by: LJH-LBJ <98734602+LJH-LBJ@users.noreply.github.com> Co-authored-by: Isotr0py <2037008807@qq.com> --- docs/configuration/optimization.md | 1 + tests/multimodal/test_utils.py | 18 ++++-- vllm/model_executor/models/kimi_vl.py | 52 
+++++++++++----- vllm/model_executor/models/moonvit.py | 77 +++++++++++++++++------- vllm/model_executor/models/qwen2_5_vl.py | 12 ++-- vllm/multimodal/utils.py | 57 +++++++++++++----- 6 files changed, 156 insertions(+), 61 deletions(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index 2d8cdcc11fa99..b0ea9621d545a 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -174,6 +174,7 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u Known supported models: +- Kimi-VL () - Llama4 () - MiniCPM-V-2.5 or above (, ) - Qwen2.5-VL () diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index a028c668c8ab7..05e68a961a548 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -636,8 +636,10 @@ def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int, # Run the model through the sharded function with torch.inference_mode(): - sharded_output = run_dp_sharded_mrope_vision_model( - vision_model, pixel_values, grid_thw_list) + sharded_output = run_dp_sharded_mrope_vision_model(vision_model, + pixel_values, + grid_thw_list, + rope_type="rope_3d") sharded_output = torch.cat(sharded_output, dim=0) # Check that the world size is setup correctly @@ -691,8 +693,10 @@ def run_dp_sharded_mrope_vision_model_empty_input_worker( # Should handle empty input gracefully with torch.inference_mode(): - output = run_dp_sharded_mrope_vision_model(vision_model, pixel_values, - grid_thw_list) + output = run_dp_sharded_mrope_vision_model(vision_model, + pixel_values, + grid_thw_list, + rope_type="rope_3d") assert len(output) == 0 @@ -745,8 +749,10 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker( # Should handle uneven distribution without errors with torch.inference_mode(): - output_tuple = run_dp_sharded_mrope_vision_model( - vision_model, pixel_values, grid_thw_list) + output_tuple = 
run_dp_sharded_mrope_vision_model(vision_model, + pixel_values, + grid_thw_list, + rope_type="rope_3d") # Verify output shape is reasonable merge_factor = vision_model.spatial_merge_size**2 diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index a08a9a62a57c5..4f76d4afdb20e 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -56,6 +56,7 @@ from transformers.activations import GELUActivation from vllm.config import VllmConfig from vllm.distributed import get_pp_group from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead) @@ -76,6 +77,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, PromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.multimodal.utils import run_dp_sharded_mrope_vision_model from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config @@ -93,8 +95,10 @@ class MaxImageTokenMeta: class KimiVLMultiModalProjector(nn.Module): - def __init__(self, config: KimiVLConfig): + def __init__(self, config: KimiVLConfig, \ + use_data_parallel: bool = False, prefix: str = ""): super().__init__() + self.use_data_parallel = use_data_parallel self.hidden_size = (config.vision_config.hidden_size * config.vision_config.merge_kernel_size[0] * @@ -102,20 +106,24 @@ class KimiVLMultiModalProjector(nn.Module): self.pre_norm = torch.nn.LayerNorm(config.vision_config.hidden_size, eps=1e-5) - self.linear_1 = nn.Linear(self.hidden_size, - self.hidden_size, - bias=True) + self.linear_1 = ReplicatedLinear(self.hidden_size, + 
self.hidden_size, + bias=True, + prefix=maybe_prefix( + prefix, "linear_1")) + self.linear_2 = ReplicatedLinear(self.hidden_size, + config.text_config.hidden_size, + bias=True, + prefix=maybe_prefix( + prefix, "linear_2")) self.act = GELUActivation() - self.linear_2 = nn.Linear(self.hidden_size, - config.text_config.hidden_size, - bias=True) def forward(self, image_features: torch.Tensor) -> torch.Tensor: hidden_states = self.pre_norm(image_features).view( -1, self.hidden_size) - hidden_states = self.linear_1(hidden_states) + hidden_states, _ = self.linear_1(hidden_states) hidden_states = self.act(hidden_states) - hidden_states = self.linear_2(hidden_states) + hidden_states, _ = self.linear_2(hidden_states) return hidden_states @@ -273,6 +281,8 @@ class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]): class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): + supports_encoder_tp_data = True + @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: if modality.startswith("image"): @@ -292,10 +302,17 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, quant_config = vllm_config.quant_config assert isinstance(config.vision_config, MoonViTConfig) + self.use_data_parallel = model_config.multimodal_config.mm_encoder_tp_mode == "data" + self.hidden_size = config.text_config.hidden_size + self.vision_tower = MoonVitPretrainedModel(config.vision_config, + self.use_data_parallel, + prefix=maybe_prefix( + prefix, "vision_tower")) - self.vision_tower = MoonVitPretrainedModel(config.vision_config) - - self.multi_modal_projector = KimiVLMultiModalProjector(config=config) + self.multi_modal_projector = KimiVLMultiModalProjector( + config=config, + use_data_parallel=self.use_data_parallel, + prefix=maybe_prefix(prefix, "multi_modal_projector")) self.quant_config = quant_config sub_vllm_config = copy.deepcopy(vllm_config) @@ -376,13 +393,19 @@ class 
KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, pixel_values = inputs["pixel_values"] image_grid_hws = inputs["image_grid_hws"] - return self.vision_tower(pixel_values, image_grid_hws) + if self.use_data_parallel: + return run_dp_sharded_mrope_vision_model(self.vision_tower, + pixel_values, + image_grid_hws.tolist(), + rope_type="rope_2d") + else: + return self.vision_tower(pixel_values, image_grid_hws) def _process_image_input(self, image_input: KimiVLImageInputs) -> torch.Tensor: assert image_input["type"] == "pixel_values" image_features = self._process_image_pixels(image_input) - assert isinstance(image_features, list) + assert isinstance(image_features, (list, tuple)) lengths = [x.shape[0] for x in image_features] return self.multi_modal_projector( torch.cat(image_features)).split(lengths) @@ -496,6 +519,7 @@ class KimiVLForConditionalGeneration(nn.Module, SupportsMultiModal, expert_params_mapping = [] params_dict = dict(self.named_parameters()) + for args in weights: name, loaded_weight = args[:2] kwargs = args[2] if len(args) > 2 else {} diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py index d0fdab13ef0c9..41a2c836b09f3 100644 --- a/vllm/model_executor/models/moonvit.py +++ b/vllm/model_executor/models/moonvit.py @@ -42,7 +42,6 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-import math from collections.abc import Sequence from copy import deepcopy from functools import cached_property @@ -55,6 +54,8 @@ from transformers.activations import ACT2FN, PytorchGELUTanh from transformers.modeling_utils import PreTrainedModel from transformers.utils import is_flash_attn_2_available +from vllm.model_executor.layers.linear import ReplicatedLinear +from vllm.model_executor.models.utils import maybe_prefix from vllm.transformers_utils.configs.moonvit import MoonViTConfig if is_flash_attn_2_available(): @@ -383,21 +384,30 @@ class MLP2(nn.Module): bias: whether to use bias in linear layer. """ - def __init__(self, dims: list[int], activation, bias=True): + def __init__(self, + dims: list[int], + activation, + bias=True, + prefix: str = "", + use_data_parallel: bool = False): super().__init__() assert len(dims) == 3 - self.fc0 = nn.Linear(dims[0], dims[1], bias=bias) - self.fc1 = nn.Linear(dims[1], dims[2], bias=bias) + self.use_data_parallel = use_data_parallel + self.fc0 = ReplicatedLinear(dims[0], + dims[1], + bias=bias, + prefix=maybe_prefix(prefix, "fc0")) + self.fc1 = ReplicatedLinear(dims[1], + dims[2], + bias=bias, + prefix=maybe_prefix(prefix, "fc1")) self.activation = activation - for m in [self.fc0, self.fc1]: - nn.init.trunc_normal_(m.weight, std=math.sqrt(2 / m.in_features)) - if m.bias is not None: - nn.init.zeros_(m.bias) def forward(self, x: torch.Tensor) -> torch.Tensor: - x = self.fc0(x) + x, _ = self.fc0(x) x = self.activation(x) - return self.fc1(x) + x, _ = self.fc1(x) + return x class MoonVitEncoderLayer(nn.Module): @@ -407,6 +417,8 @@ class MoonVitEncoderLayer(nn.Module): num_heads: int, hidden_dim: int, mlp_dim: int, + prefix: str = "", + use_data_parallel: bool = False, *, attn_implementation: str = "sdpa", activation=F.gelu, @@ -423,9 +435,19 @@ class MoonVitEncoderLayer(nn.Module): self.norm0 = nn.LayerNorm(hidden_dim) self.norm1 = nn.LayerNorm(hidden_dim) - self.mlp = MLP2([hidden_dim, mlp_dim, hidden_dim], activation) - 
self.wqkv = nn.Linear(hidden_dim, hidden_dim * 3, bias=attn_bias) - self.wo = nn.Linear(hidden_dim, hidden_dim, bias=attn_bias) + self.use_data_parallel = use_data_parallel + self.mlp = MLP2([hidden_dim, mlp_dim, hidden_dim], + activation, + prefix=f"{prefix}.mlp", + use_data_parallel=use_data_parallel) + self.wqkv = ReplicatedLinear(hidden_dim, + hidden_dim * 3, + bias=attn_bias, + prefix=f"{prefix}.wqkv") + self.wo = ReplicatedLinear(hidden_dim, + hidden_dim, + bias=attn_bias, + prefix=f"{prefix}.wo") def attention_qkvpacked( self, @@ -438,7 +460,7 @@ class MoonVitEncoderLayer(nn.Module): x (torch.Tensor): (batch_size, seqlen, hidden_dim) cu_seqlens (torch.Tensor): """ - xqkv = self.wqkv(x) + xqkv, _ = self.wqkv(x) qkv_shape = xqkv.size()[:-1] + ( 3, @@ -457,8 +479,7 @@ class MoonVitEncoderLayer(nn.Module): xv, q_cu_seqlens=cu_seqlens, k_cu_seqlens=cu_seqlens) - - attn_out = self.wo(attn_out) + attn_out, _ = self.wo(attn_out) return attn_out def forward( @@ -494,13 +515,17 @@ class MoonVitEncoder(nn.Module): hidden_dim: int, num_layers: int, block_cfg: dict, + prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() self.rope_2d = Rope2DPosEmb( block_cfg["hidden_dim"] // block_cfg["num_heads"], 512, 512) self.blocks = nn.ModuleList( - [MoonVitEncoderLayer(**block_cfg) for _ in range(num_layers)]) + [MoonVitEncoderLayer(use_data_parallel=use_data_parallel, \ + prefix=f"{prefix}.blocks.{layer_idx}", \ + **block_cfg) for layer_idx in range(num_layers)]) self.final_layernorm = nn.LayerNorm(hidden_dim) def forward(self, hidden_states: torch.Tensor, @@ -508,10 +533,9 @@ class MoonVitEncoder(nn.Module): rope_freqs_cis = self.rope_2d.get_freqs_cis_by_seqlens( grid_hws=grid_hw) - lengths = torch.cat(( - torch.zeros(1, device=hidden_states.device, dtype=grid_hw.dtype), - grid_hw[:, 0] * grid_hw[:, 1], - )) + lengths = torch.cat( + (torch.zeros(1, device=hidden_states.device, dtype=grid_hw.dtype), + (grid_hw[:, 0] * grid_hw[:, 
1]).to(hidden_states.device))) cu_seqlens = lengths.cumsum(dim=0, dtype=torch.int32) for _, block in enumerate(self.blocks): @@ -587,11 +611,19 @@ class MoonVitPretrainedModel(PreTrainedModel): _supports_flash_attn_2 = True _supports_sdpa = True - def __init__(self, config: MoonViTConfig, *inputs, **kwargs): + def __init__(self, + config: MoonViTConfig, + use_data_parallel: bool = False, + prefix: str = "", + *inputs, + **kwargs): super().__init__(config, *inputs, **kwargs) config = deepcopy(config) + self.use_data_parallel = use_data_parallel self.merge_kernel_size = config.merge_kernel_size + self.hidden_size = config.hidden_size self.patch_size = config.patch_size + self.vit_processing_type = "rope_2d" self.patch_embed = MoonVisionPatchEmbed( out_dim=config.hidden_size, patch_size=config.patch_size, @@ -610,6 +642,7 @@ class MoonVitPretrainedModel(PreTrainedModel): "attn_bias": True, "attn_implementation": config._attn_implementation, }, + prefix=f"{prefix}.encoder", ) def forward(self, pixel_values: torch.Tensor, diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index b528083b7c9cc..c8f7fc16b4e83 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1021,8 +1021,10 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, pixel_values = image_input["pixel_values"] if self.use_data_parallel: - return run_dp_sharded_mrope_vision_model( - self.visual, pixel_values, grid_thw_list) + return run_dp_sharded_mrope_vision_model(self.visual, + pixel_values, + grid_thw_list, + rope_type="rope_3d") else: image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) @@ -1048,8 +1050,10 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, else: pixel_values_videos = video_input["pixel_values_videos"] if self.use_data_parallel: - return run_dp_sharded_mrope_vision_model( - self.visual, pixel_values_videos, grid_thw_list) + return 
run_dp_sharded_mrope_vision_model(self.visual, + pixel_values_videos, + grid_thw_list, + rope_type="rope_3d") else: video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw_list) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 834b2189e4bed..ac967dcc4003e 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -9,7 +9,7 @@ from collections.abc import Iterable from concurrent.futures import ThreadPoolExecutor from itertools import groupby from pathlib import Path -from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union +from typing import TYPE_CHECKING, Any, Literal, Optional, TypeVar, Union from urllib.parse import ParseResult, urlparse from urllib.request import url2pathname @@ -444,7 +444,6 @@ def run_dp_sharded_vision_model(image_input: torch.Tensor, Args: image_input (torch.Tensor): Image input tensor. vision_model (torch.nn.Module): Vision model. - Returns: torch.Tensor: Output image embeddings """ @@ -542,6 +541,8 @@ def run_dp_sharded_mrope_vision_model( vision_model: torch.nn.Module, pixel_values: torch.Tensor, grid_thw_list: list[list[int]], + *, + rope_type: Literal["rope_3d", "rope_2d"], ) -> tuple[torch.Tensor, ...]: """Run a vision model with data parallelism (DP) sharding. The function will shard the input image tensor on the @@ -552,6 +553,10 @@ def run_dp_sharded_mrope_vision_model( vision_model (torch.nn.Module): Vision model. pixel_values (torch.Tensor): Image/Video input tensor. grid_thw_list: List of grid dimensions for each image + rope_type: Type of rope used in the vision model. + Different rope types have different dimension to do ViT. 
+ "rope_3d" for 3D rope (e.g., Qwen2.5-VL) + "rope_2d" for 2D rope (e.g., Kimi-VL) Returns: torch.Tensor: Output image embeddings @@ -605,8 +610,12 @@ def run_dp_sharded_mrope_vision_model( device=pixel_values.device, dtype=pixel_values.dtype) # embed_dim_reduction_factor = 2 * 2 - embed_dim_reduction_factor = (vision_model.spatial_merge_size * - vision_model.spatial_merge_size) + if rope_type == "rope_2d": + embed_dim_reduction_factor = (vision_model.merge_kernel_size[0] * + vision_model.merge_kernel_size[1]) + else: + embed_dim_reduction_factor = (vision_model.spatial_merge_size * + vision_model.spatial_merge_size) # Find the max length across all ranks # The output embedding of every DP rank has to be @@ -617,23 +626,42 @@ def run_dp_sharded_mrope_vision_model( local_grid_thw_list = [grid_thw_list[i] for i in image_idxs_local] # Run the vision model on the local pixel_values_local - if pixel_values_local.shape[0] > 0: - image_embeds_local = vision_model(pixel_values_local, - local_grid_thw_list) + if rope_type == "rope_2d": + if pixel_values_local.shape[0] > 0: + image_embeds_local = vision_model( + pixel_values_local, torch.tensor(local_grid_thw_list)) + if isinstance(image_embeds_local, list): + image_embeds_local = torch.cat(image_embeds_local, dim=0) + else: + out_dim = getattr(vision_model.config, "hidden_size", None) + image_embeds_local = torch.empty( + (0, embed_dim_reduction_factor, out_dim), + device=pixel_values.device, + dtype=pixel_values.dtype) else: - # Handle empty case - image_embeds_local = torch.empty((0, vision_model.out_hidden_size), - device=pixel_values.device, - dtype=pixel_values.dtype) + if pixel_values_local.shape[0] > 0: + image_embeds_local = vision_model(pixel_values_local, + local_grid_thw_list) + else: + # Handle empty case + image_embeds_local = torch.empty((0, vision_model.out_hidden_size), + device=pixel_values.device, + dtype=pixel_values.dtype) # Pad the output based on max_len_per_rank # for tensor_model_parallel_all_gather 
to work current_len = image_embeds_local.shape[0] if current_len < max_len_per_rank: padding_size = max_len_per_rank - current_len - padding = torch.empty((padding_size, image_embeds_local.shape[1]), - dtype=image_embeds_local.dtype, - device=image_embeds_local.device) + if rope_type == "rope_2d": + padding = torch.empty((padding_size, image_embeds_local.shape[1], + image_embeds_local.shape[2]), + dtype=image_embeds_local.dtype, + device=image_embeds_local.device) + else: + padding = torch.empty((padding_size, image_embeds_local.shape[1]), + dtype=image_embeds_local.dtype, + device=image_embeds_local.device) image_embeds_local_padded = torch.cat([image_embeds_local, padding], dim=0) else: @@ -674,7 +702,6 @@ def run_dp_sharded_mrope_vision_model( embed_start:embed_start + img_patches] embed_start += img_patches current_idx += count - out_embeddings = tuple(embed for embed in original_order_embeddings if embed is not None) assert len(out_embeddings) == len( From 5685370271d7f3e8222e26efb854e72e826b9af7 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 1 Sep 2025 12:07:53 -0700 Subject: [PATCH 11/95] [Chore][V0 Deprecation] Move LogProb to a separate file (#24055) Signed-off-by: Woosuk Kwon --- vllm/beam_search.py | 2 +- vllm/entrypoints/openai/protocol.py | 2 +- vllm/entrypoints/openai/serving_chat.py | 2 +- vllm/entrypoints/openai/serving_completion.py | 2 +- vllm/entrypoints/openai/serving_engine.py | 2 +- vllm/entrypoints/openai/serving_responses.py | 4 +-- vllm/logprobs.py | 28 +++++++++++++++++++ vllm/model_executor/layers/sampler.py | 4 +-- vllm/model_executor/model_loader/neuron.py | 4 +-- .../model_loader/neuronx_distributed.py | 4 +-- vllm/outputs.py | 5 ++-- vllm/sequence.py | 25 +---------------- vllm/transformers_utils/detokenizer.py | 5 ++-- vllm/v1/engine/logprobs.py | 2 +- 14 files changed, 49 insertions(+), 42 deletions(-) create mode 100644 vllm/logprobs.py diff --git a/vllm/beam_search.py b/vllm/beam_search.py index 5a2e79e1b5c74..01124872e98c0 
100644 --- a/vllm/beam_search.py +++ b/vllm/beam_search.py @@ -4,8 +4,8 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Optional, Union +from vllm.logprobs import Logprob from vllm.lora.request import LoRARequest -from vllm.sequence import Logprob if TYPE_CHECKING: from vllm.multimodal import MultiModalDataDict diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 30c3a82696155..4881022325625 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -43,10 +43,10 @@ from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, from vllm.entrypoints.score_utils import (ScoreContentPartParam, ScoreMultiModalParam) from vllm.logger import init_logger +from vllm.logprobs import Logprob from vllm.pooling_params import PoolingParams from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams, RequestOutputKind, SamplingParams) -from vllm.sequence import Logprob from vllm.utils import random_uuid, resolve_obj_by_qualname logger = init_logger(__name__) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 6300d0758c3d4..35edd2f85cd07 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -43,10 +43,10 @@ from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( from vllm.entrypoints.utils import get_max_tokens from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.logger import init_logger +from vllm.logprobs import Logprob from vllm.outputs import CompletionOutput, RequestOutput from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.sampling_params import BeamSearchParams, SamplingParams -from vllm.sequence import Logprob from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer from vllm.transformers_utils.tokenizers import (maybe_serialize_tool_calls, truncate_tool_call_ids, diff --git 
a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 11effba8f9eb3..b26140d4b9d7a 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -38,9 +38,9 @@ from vllm.entrypoints.utils import get_max_tokens from vllm.inputs.data import (EmbedsPrompt, TokensPrompt, is_embeds_prompt, is_tokens_prompt) from vllm.logger import init_logger +from vllm.logprobs import Logprob from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams, SamplingParams -from vllm.sequence import Logprob from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import as_list, merge_async_iterators diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index b6a18760115a2..796b8ab5fc2cb 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -67,13 +67,13 @@ from vllm.inputs.data import EmbedsPrompt as EngineEmbedsPrompt from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.inputs.parse import parse_and_batch_prompt from vllm.logger import init_logger +from vllm.logprobs import Logprob, PromptLogprobs from vllm.lora.request import LoRARequest from vllm.multimodal import ( # noqa: F401 - Required to resolve Pydantic error in RequestProcessingMixin MultiModalDataDict, MultiModalUUIDDict) from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.sampling_params import BeamSearchParams, SamplingParams -from vllm.sequence import Logprob, PromptLogprobs from vllm.tracing import (contains_trace_headers, extract_trace_headers, log_tracing_disabled_warning) from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 899cb07b2b37d..6a676cfe1b388 100644 --- 
a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -58,11 +58,11 @@ from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.tool_server import MCPToolServer, ToolServer from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.logger import init_logger +from vllm.logprobs import Logprob as SampleLogprob +from vllm.logprobs import SampleLogprobs from vllm.outputs import CompletionOutput from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.sampling_params import SamplingParams -from vllm.sequence import Logprob as SampleLogprob -from vllm.sequence import SampleLogprobs from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import random_uuid diff --git a/vllm/logprobs.py b/vllm/logprobs.py new file mode 100644 index 0000000000000..e58ca142c00a4 --- /dev/null +++ b/vllm/logprobs.py @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import dataclass +from typing import Optional + + +# We use dataclass for now because it is used for +# openai server output, and msgspec is not serializable. +# TODO(sang): Fix it. +@dataclass +class Logprob: + """Infos for supporting OpenAI compatible logprobs and token ranks. + + Attributes: + logprob: The logprob of chosen token + rank: The vocab rank of chosen token (>=1) + decoded_token: The decoded chosen token index + """ + logprob: float + rank: Optional[int] = None + decoded_token: Optional[str] = None + + +# {token_id -> logprob} per each sequence group. None if the corresponding +# sequence group doesn't require prompt logprob. +PromptLogprobs = list[Optional[dict[int, Logprob]]] +# {token_id -> logprob} for each sequence group. 
+SampleLogprobs = list[dict[int, Logprob]] diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index e77eb637c8942..829dd82b0bd4d 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -13,14 +13,14 @@ import torch import torch.nn as nn import vllm.envs as envs +from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs from vllm.model_executor.layers.utils import apply_penalties from vllm.model_executor.sampling_metadata import (SamplingMetadata, SamplingTensors, SequenceGroupToSample) from vllm.sampling_params import SamplingType from vllm.sequence import (VLLM_INVALID_TOKEN_ID, - CompletionSequenceGroupOutput, Logprob, - PromptLogprobs, SampleLogprobs, SequenceOutput) + CompletionSequenceGroupOutput, SequenceOutput) if envs.VLLM_USE_FLASHINFER_SAMPLER and find_spec("flashinfer"): # yapf: disable diff --git a/vllm/model_executor/model_loader/neuron.py b/vllm/model_executor/model_loader/neuron.py index fad97aba84b6a..ee484e9a7b0a4 100644 --- a/vllm/model_executor/model_loader/neuron.py +++ b/vllm/model_executor/model_loader/neuron.py @@ -14,12 +14,12 @@ from transformers import PretrainedConfig from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig) +from vllm.logprobs import Logprob from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import get_quantization_config from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, - SequenceOutput) +from vllm.sequence import CompletionSequenceGroupOutput, SequenceOutput TORCH_DTYPE_TO_NEURON_AMP = { "auto": "f32", diff --git a/vllm/model_executor/model_loader/neuronx_distributed.py b/vllm/model_executor/model_loader/neuronx_distributed.py index f450961c64ff4..34bf43fe7b57c 100644 --- 
a/vllm/model_executor/model_loader/neuronx_distributed.py +++ b/vllm/model_executor/model_loader/neuronx_distributed.py @@ -27,11 +27,11 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, PretrainedConfig from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig) from vllm.logger import init_logger +from vllm.logprobs import Logprob from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, - SequenceOutput) +from vllm.sequence import CompletionSequenceGroupOutput, SequenceOutput # yapf: enable logger = init_logger(__name__) diff --git a/vllm/outputs.py b/vllm/outputs.py index acdb2f89ce735..64bcfd472f2ad 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -11,11 +11,12 @@ import torch from typing_extensions import TypeVar from vllm.logger import init_logger +from vllm.logprobs import PromptLogprobs, SampleLogprobs from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import MultiModalPlaceholderDict from vllm.sampling_params import RequestOutputKind -from vllm.sequence import (PromptLogprobs, RequestMetrics, SampleLogprobs, - SequenceGroup, SequenceGroupBase, SequenceStatus) +from vllm.sequence import (RequestMetrics, SequenceGroup, SequenceGroupBase, + SequenceStatus) logger = init_logger(__name__) diff --git a/vllm/sequence.py b/vllm/sequence.py index 7b48b7be9f511..4b8e1f4641f79 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -16,6 +16,7 @@ import msgspec import torch from vllm.inputs import SingletonInputs +from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict from vllm.pooling_params import PoolingParams from vllm.sampling_params import RequestOutputKind, SamplingParams @@ -38,30 +39,6 @@ def 
array_full(token_id: int, count: int): return array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count -# We use dataclass for now because it is used for -# openai server output, and msgspec is not serializable. -# TODO(sang): Fix it. -@dataclass -class Logprob: - """Infos for supporting OpenAI compatible logprobs and token ranks. - - Attributes: - logprob: The logprob of chosen token - rank: The vocab rank of chosen token (>=1) - decoded_token: The decoded chosen token index - """ - logprob: float - rank: Optional[int] = None - decoded_token: Optional[str] = None - - -# {token_id -> logprob} per each sequence group. None if the corresponding -# sequence group doesn't require prompt logprob. -PromptLogprobs = list[Optional[dict[int, Logprob]]] -# {token_id -> logprob} for each sequence group. -SampleLogprobs = list[dict[int, Logprob]] - - class SequenceStatus(enum.IntEnum): """Status of a sequence.""" WAITING = 0 diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py index 380c62a141f0f..56b01ecf78c46 100644 --- a/vllm/transformers_utils/detokenizer.py +++ b/vllm/transformers_utils/detokenizer.py @@ -3,8 +3,9 @@ from typing import Optional -from vllm.sequence import (VLLM_INVALID_TOKEN_ID, Logprob, SamplingParams, - Sequence, SequenceGroup) +from vllm.logprobs import Logprob +from vllm.sequence import (VLLM_INVALID_TOKEN_ID, SamplingParams, Sequence, + SequenceGroup) from .detokenizer_utils import (convert_prompt_ids_to_tokens, detokenize_incrementally) diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py index 3de7fa6889e55..133122b6fcc0c 100644 --- a/vllm/v1/engine/logprobs.py +++ b/vllm/v1/engine/logprobs.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from typing import Optional from vllm.logger import init_logger -from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs +from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs from vllm.transformers_utils.detokenizer_utils import ( 
AnyTokenizer, convert_ids_list_to_tokens) from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest From a344a5aa0a58cc1758d9721e848ce1f5ca4b6c7f Mon Sep 17 00:00:00 2001 From: Lucia Fang <116399278+luccafong@users.noreply.github.com> Date: Mon, 1 Sep 2025 14:09:37 -0700 Subject: [PATCH 12/95] [bugfix]fix MTP hidden states (#24056) Signed-off-by: Lu Fang --- vllm/v1/spec_decode/eagle.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 0a0e9fed725cb..bf25c91d8390c 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -220,6 +220,7 @@ class EagleProposer: ) if self.method in ("deepseek_mtp", "ernie_mtp"): last_hidden_states = ret_hidden_states + hidden_states = last_hidden_states else: last_hidden_states, hidden_states = ret_hidden_states sample_hidden_states = last_hidden_states[last_token_indices] From 0235103cbbdb511e6708aae600f759060a797c16 Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Tue, 2 Sep 2025 04:07:45 +0200 Subject: [PATCH 13/95] [Doc]: fix typos in Python comments (#24042) Signed-off-by: Didier Durand Co-authored-by: Jee Jee Li --- vllm/distributed/device_communicators/quick_all_reduce.py | 2 +- vllm/distributed/device_communicators/ray_communicator.py | 2 +- vllm/entrypoints/openai/run_batch.py | 2 +- vllm/entrypoints/openai/serving_responses.py | 2 +- vllm/executor/ray_utils.py | 2 +- .../model_executor/layers/quantization/utils/quant_utils.py | 2 +- vllm/model_executor/models/registry.py | 2 +- vllm/model_executor/sampling_metadata.py | 2 +- vllm/scalar_type.py | 2 +- vllm/sequence.py | 2 +- vllm/v1/core/sched/scheduler.py | 6 +++--- vllm/v1/metrics/stats.py | 2 +- 12 files changed, 14 insertions(+), 14 deletions(-) diff --git a/vllm/distributed/device_communicators/quick_all_reduce.py b/vllm/distributed/device_communicators/quick_all_reduce.py index c61231e2d33f4..836241910e2fb 100644 --- 
a/vllm/distributed/device_communicators/quick_all_reduce.py +++ b/vllm/distributed/device_communicators/quick_all_reduce.py @@ -78,7 +78,7 @@ class QuickAllReduce: group: the process group to work on. If None, it will use the default process group. device: the device to bind the CustomAllreduce to. If None, - it will be bind to f"cuda:{local_rank}". + it will be bound to f"cuda:{local_rank}". It is the caller's responsibility to make sure each communicator is bind to a unique device, and all communicators in this group are in the same node. diff --git a/vllm/distributed/device_communicators/ray_communicator.py b/vllm/distributed/device_communicators/ray_communicator.py index 46cc1c2f52d67..8cd8c459a9e51 100644 --- a/vllm/distributed/device_communicators/ray_communicator.py +++ b/vllm/distributed/device_communicators/ray_communicator.py @@ -186,7 +186,7 @@ class RayPPCommunicator(Communicator): """ Receive a torch.Tensor from a peer and synchronize the current stream. - After this call returns, the receive buffer is safe to read from from + After this call returns, the receive buffer is safe to read from any stream. An RayChannelError will be raised if an error occurred (e.g., remote actor died), and the buffer is not safe to read. diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 01551a8c7f04a..fa813550e520c 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -161,7 +161,7 @@ async def write_local_file(output_path: str, batch_outputs: The list of batch outputs to write. """ # We should make this async, but as long as run_batch runs as a - # standalone program, blocking the event loop won't effect performance. + # standalone program, blocking the event loop won't affect performance. 
with open(output_path, "w", encoding="utf-8") as f: for o in batch_outputs: print(o.model_dump_json(), file=f) diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 6a676cfe1b388..4c15de3030998 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -728,7 +728,7 @@ class OpenAIServingResponses(OpenAIServing): for response_msg in request.input: messages.append( parse_response_input(response_msg, prev_outputs)) - # User passes in a a tool call request and its output. We need + # User passes in a tool call request and its output. We need # to add the tool call request to prev_outputs so that the # parse_response_input can find the tool call request when # parsing the tool call output. diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 4b2a15afb67a7..0bdeb28569892 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -223,7 +223,7 @@ def _wait_until_pg_ready(current_placement_group: "PlacementGroup"): """ # Wait until PG is ready - this will block until all - # requested resources are available, and will timeout + # requested resources are available, and will time out # if they cannot be provisioned. 
placement_group_specs = current_placement_group.bundle_specs diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index 6154fca2e416d..f4ff875adb21c 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -116,7 +116,7 @@ def _normalize_quant_group_shape(x: torch.Tensor, group_shape: GroupShape): # then we would expand a to: # a = [[1, 1, 2, 2], # [3, 3, 4, 4]] -# NOTE this function this function does not explicitly broadcast dimensions +# NOTE this function does not explicitly broadcast dimensions # with an extent of 1, since this can be done implicitly by pytorch def group_broadcast(t, shape): for i, s in enumerate(shape): diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index edb7f24214406..f236040bb2341 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -185,7 +185,7 @@ _EMBEDDING_MODELS = { "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501 # Technically PrithviGeoSpatialMAE is a model that works on images, both in - # input and output. I am adding it here because it piggy-backs on embedding + # input and output. I am adding it here because it piggybacks on embedding # models for the time being. "PrithviGeoSpatialMAE": ("prithvi_geospatial_mae", "PrithviGeoSpatialMAE"), } diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 56f0f0984bfa0..2315f9dad5a5a 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -97,7 +97,7 @@ class SamplingMetadataCache: class SamplingMetadata: """Metadata for input sequences. Used in sampler. 
- The usage is as follow; + The usage is as follows; ``` hidden_states = execute_model(...) logits = hidden_states[sampling_metadata.selected_token_indices] diff --git a/vllm/scalar_type.py b/vllm/scalar_type.py index 6f11ab8e0300a..055f28914ad59 100644 --- a/vllm/scalar_type.py +++ b/vllm/scalar_type.py @@ -269,7 +269,7 @@ class ScalarType: @classmethod def uint(cls, size_bits: int, bias: Optional[int]) -> 'ScalarType': - """Create a unsigned integer scalar type.""" + """Create an unsigned integer scalar type.""" ret = cls(0, size_bits, False, bias if bias else 0) ret.id # noqa B018: make sure the id is cached return ret diff --git a/vllm/sequence.py b/vllm/sequence.py index 4b8e1f4641f79..24114c0bb792e 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -1193,7 +1193,7 @@ class HiddenStates(msgspec.Struct, array_like=True, seq_ids are the sequence ids of each entry of the batch dimension of the hidden_states tensor""" # Scorer hidden states. For prefill step, it is used for hidden states of - # all tokens, whereas for decode step, it use used for last accepted tokens. + # all tokens, whereas for decode step, it is used for last accepted tokens. hidden_states: torch.Tensor # The sequence group metadata list. Only needed for decode step. seq_group_metadata_list: Optional[list[SequenceGroupMetadata]] = None diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index e07d53ff84d37..8322fa7335b69 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -815,7 +815,7 @@ class Scheduler(SchedulerInterface): # NOTE: structured_output_request_ids maps # a request's (request that uses structured output) # request_id to its index in the batch. - # This will helps us determine to slice the grammar bitmask + # This will help us determine to slice the grammar bitmask # and only applies valid mask for requests that # uses structured decoding. 
structured_output_request_ids: dict[str, int] = {} @@ -923,7 +923,7 @@ class Scheduler(SchedulerInterface): request): # NOTE: structured_output_request # should not be None if use_structured_output, we have - # check above, so safe to ignore type warning + # checked above, so safe to ignore type warning request.structured_output_request.grammar.accept_tokens( # type: ignore[union-attr] req_id, new_token_ids) @@ -1242,7 +1242,7 @@ class Scheduler(SchedulerInterface): finished_sending reqs to the output. * if finished_sending: free the blocks # if finished_recving: add to state so we can - scheduler the request during the next step. + schedule the request during the next step. """ if self.connector is not None: diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 9a80460261e02..95094bda65cde 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -59,7 +59,7 @@ class RequestStateStats: num_generation_tokens: int = 0 - # This is a engine frontend timestamp (wall-clock) + # This is an engine frontend timestamp (wall-clock) arrival_time: float = 0.0 # These are engine core timestamps (monotonic) From 2b41cbbf030dd6cf4d5441fe679ca1c9add0d0e6 Mon Sep 17 00:00:00 2001 From: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com> Date: Tue, 2 Sep 2025 06:53:00 +0300 Subject: [PATCH 14/95] [V1][Mamba1] - FP32 SSM Kernel Support (#23506) Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com> --- csrc/mamba/mamba_ssm/selective_scan_fwd.cu | 72 ++++++++++++------- .../models/language/generation/test_hybrid.py | 7 +- .../layers/mamba/mamba_utils.py | 18 +++-- 3 files changed, 65 insertions(+), 32 deletions(-) diff --git a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu index c4ddbc142791f..d534e138d26d6 100644 --- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu +++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu @@ -27,11 +27,12 @@ template + bool kHasZ_, bool kVarlen_, typename 
input_t_, typename weight_t_, typename state_t_> struct Selective_Scan_fwd_kernel_traits { static_assert(kNItems_ % 4 == 0); using input_t = input_t_; using weight_t = weight_t_; + using state_t = state_t_; static constexpr int kNThreads = kNThreads_; // Setting MinBlocksPerMP to be 3 (instead of 2) for 128 threads improves occupancy. static constexpr int kMinBlocks = kNThreads < 128 ? 5 : 3; @@ -132,7 +133,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { input_t *Bvar = reinterpret_cast(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride; weight_t *C = reinterpret_cast(params.C_ptr) + dim_id * kNRows * params.C_d_stride; input_t *Cvar = reinterpret_cast(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride; - input_t *ssm_states = reinterpret_cast(params.ssm_states_ptr) + + typename Ktraits::state_t *ssm_states = reinterpret_cast(params.ssm_states_ptr) + cache_index * params.ssm_states_batch_stride + dim_id * kNRows * params.ssm_states_dim_stride; @@ -261,7 +262,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { if (threadIdx.x == 0) { smem_running_prefix[state_idx] = prefix_op.running_prefix; if (chunk == n_chunks - 1) { - ssm_states[state_idx * params.ssm_states_dstate_stride] = input_t(prefix_op.running_prefix.y); + ssm_states[state_idx * params.ssm_states_dstate_stride] = typename Ktraits::state_t(prefix_op.running_prefix.y); } } #pragma unroll @@ -310,7 +311,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { } } -template +template void selective_scan_fwd_launch(SSMParamsBase ¶ms, cudaStream_t stream) { // Only kNRows == 1 is tested for now, which ofc doesn't differ from previously when we had each block // processing 1 row. 
@@ -321,7 +322,7 @@ void selective_scan_fwd_launch(SSMParamsBase ¶ms, cudaStream_t stream) { BOOL_SWITCH(params.seqlen % (kNThreads * kNItems) == 0, kIsEvenLen, [&] { BOOL_SWITCH(params.z_ptr != nullptr , kHasZ, [&] { BOOL_SWITCH(params.query_start_loc_ptr != nullptr , kVarlen, [&] { - using Ktraits = Selective_Scan_fwd_kernel_traits; + using Ktraits = Selective_Scan_fwd_kernel_traits; constexpr int kSmemSize = Ktraits::kSmemSize + kNRows * MAX_DSTATE * sizeof(typename Ktraits::scan_t); dim3 grid(params.batch, params.dim / kNRows); auto kernel = &selective_scan_fwd_kernel; @@ -341,59 +342,78 @@ void selective_scan_fwd_launch(SSMParamsBase ¶ms, cudaStream_t stream) { }); } -template +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream) { #ifndef USE_ROCM if (params.seqlen <= 128) { - selective_scan_fwd_launch<32, 4, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<32, 4, input_t, weight_t, state_t>(params, stream); } else if (params.seqlen <= 256) { - selective_scan_fwd_launch<32, 8, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<32, 8, input_t, weight_t, state_t>(params, stream); } else if (params.seqlen <= 512) { - selective_scan_fwd_launch<32, 16, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<32, 16, input_t, weight_t, state_t>(params, stream); } else if (params.seqlen <= 1024) { - selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream); } else { - selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream); } #else if (params.seqlen <= 256) { - selective_scan_fwd_launch<64, 4, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<64, 4, input_t, weight_t, state_t>(params, stream); } else if (params.seqlen <= 512) { - selective_scan_fwd_launch<64, 8, input_t, weight_t>(params, stream); + 
selective_scan_fwd_launch<64, 8, input_t, weight_t, state_t>(params, stream); } else if (params.seqlen <= 1024) { - selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream); } else { - selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream); + selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream); } #endif } -template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); -template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); -template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); #define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")") -#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, NAME, ...) \ +#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, STYPE, NAME, ...) 
\ if (ITYPE == at::ScalarType::Half) { \ using input_t = at::Half; \ using weight_t = float; \ - __VA_ARGS__(); \ + if (STYPE == at::ScalarType::Half) { \ + using state_t = at::Half; \ + __VA_ARGS__(); \ + } else if (STYPE == at::ScalarType::Float) { \ + using state_t = float; \ + __VA_ARGS__(); \ + } else { \ + AT_ERROR(#NAME, " not implemented for state type '", toString(STYPE), "'"); \ + } \ } else if (ITYPE == at::ScalarType::BFloat16) { \ using input_t = at::BFloat16; \ using weight_t = float; \ - __VA_ARGS__(); \ + if (STYPE == at::ScalarType::BFloat16) { \ + using state_t = at::BFloat16; \ + __VA_ARGS__(); \ + } else if (STYPE == at::ScalarType::Float) { \ + using state_t = float; \ + __VA_ARGS__(); \ + } else { \ + AT_ERROR(#NAME, " not implemented for state type '", toString(STYPE), "'"); \ + } \ } else if (ITYPE == at::ScalarType::Float) { \ using input_t = float; \ using weight_t = float; \ + using state_t = float; \ __VA_ARGS__(); \ } else { \ AT_ERROR(#NAME, " not implemented for input type '", toString(ITYPE), "'"); \ } -template +template void selective_scan_fwd_cuda(SSMParamsBase ¶ms, cudaStream_t stream); void set_ssm_params_fwd(SSMParamsBase ¶ms, @@ -648,7 +668,9 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta, // Right now u has BHL layout and delta has HBL layout, and we want out to have HBL layout at::Tensor out = delta; - TORCH_CHECK(ssm_states.scalar_type() == input_type); + // ssm_states can now be either the same as input_type or float32 + auto state_type = ssm_states.scalar_type(); + TORCH_CHECK(state_type == input_type || state_type == at::ScalarType::Float); TORCH_CHECK(ssm_states.is_cuda()); TORCH_CHECK(ssm_states.stride(-1) == 1); @@ -670,7 +692,7 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta, const at::cuda::OptionalCUDAGuard device_guard(device_of(u)); auto stream = at::cuda::getCurrentCUDAStream().stream(); - DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), 
"selective_scan_fwd", [&] { - selective_scan_fwd_cuda(params, stream); + DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), ssm_states.scalar_type(), "selective_scan_fwd", [&] { + selective_scan_fwd_cuda(params, stream); }); } diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 31ca3a6f0f985..3cacbdcfbe86e 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -65,6 +65,11 @@ V0_UNSUPPORTED_MODELS = [ "LiquidAI/LFM2-1.2B", ] +FP32_STATE_MODELS = [ + "state-spaces/mamba-130m-hf", + "Zyphra/Zamba2-1.2B-instruct", +] + # Avoid OOM MAX_NUM_SEQS = 4 @@ -434,7 +439,7 @@ def test_full_cuda_graph( ) -@pytest.mark.parametrize("model", ["Zyphra/Zamba2-1.2B-instruct"]) +@pytest.mark.parametrize("model", FP32_STATE_MODELS) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) def test_fp32_state( diff --git a/vllm/model_executor/layers/mamba/mamba_utils.py b/vllm/model_executor/layers/mamba/mamba_utils.py index 280a9e45e662e..1dc46639640b0 100644 --- a/vllm/model_executor/layers/mamba/mamba_utils.py +++ b/vllm/model_executor/layers/mamba/mamba_utils.py @@ -30,12 +30,8 @@ class MambaStateDtypeCalculator: mamba_cache_dtype: MambaDType, mamba_ssm_cache_dtype: MambaDType, ) -> tuple[torch.dtype, ...]: - # TODO (tdoublep) requires kernel changes - if mamba_cache_dtype == "float32" or mamba_ssm_cache_dtype == "float32": - raise ValueError("fp32 state for mamba1 is not yet supported") - else: - return MambaStateDtypeCalculator.mamba2_state_dtype( - model_dtype, mamba_cache_dtype, mamba_ssm_cache_dtype) + return cls._mamba_state_dtype(model_dtype, mamba_cache_dtype, + mamba_ssm_cache_dtype) @classmethod def mamba2_state_dtype( @@ -43,6 +39,16 @@ class MambaStateDtypeCalculator: model_dtype: Union[ModelDType, torch.dtype], mamba_cache_dtype: MambaDType, mamba_ssm_cache_dtype: MambaDType, + ) -> tuple[torch.dtype, 
...]: + return cls._mamba_state_dtype(model_dtype, mamba_cache_dtype, + mamba_ssm_cache_dtype) + + @classmethod + def _mamba_state_dtype( + cls, + model_dtype: Union[ModelDType, torch.dtype], + mamba_cache_dtype: MambaDType, + mamba_ssm_cache_dtype: MambaDType, ) -> tuple[torch.dtype, ...]: conv_state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype) From 04d0c60770fef98808db1c3d3361ab1a74384726 Mon Sep 17 00:00:00 2001 From: damon Date: Tue, 2 Sep 2025 11:54:20 +0800 Subject: [PATCH 15/95] =?UTF-8?q?[Bugfix]=20Fix=20the=20issue=20that=20Bli?= =?UTF-8?q?p2ForConditionalGeneration'=20object=20has=E2=80=A6=20(#24028)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Dazhi Jiang --- vllm/model_executor/models/blip2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 2f2b880bb0e14..ed98a3008c567 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -560,8 +560,8 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP, self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - def _create_image_input(self, - **kwargs: object) -> Optional[Blip2ImageInputs]: + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[Blip2ImageInputs]: pixel_values = kwargs.pop("pixel_values", None) image_embeds = kwargs.pop("image_embeds", None) From d59c986444a701b39369453eff0a8ba324bd565f Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Tue, 2 Sep 2025 00:54:37 -0300 Subject: [PATCH 16/95] Remove runtime checks based on pooling params (#24051) Signed-off-by: Max de Bayser --- vllm/v1/worker/gpu_input_batch.py | 17 ++++++----------- vllm/v1/worker/gpu_model_runner.py | 20 ++++++++------------ 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/vllm/v1/worker/gpu_input_batch.py 
b/vllm/v1/worker/gpu_input_batch.py index f4c2f45df5954..ef5a7e39a5b16 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -704,17 +704,12 @@ class InputBatch: logitsprocs=self.logitsprocs, ) - @property - def pooling_metadata(self) -> PoolingMetadata: - if len(self.pooling_params) == 0: - pooling_params = [] - else: - # Note, for now this assumes that all request in the batch - # are either sampling or pooling requests - assert len(self.req_ids) == len(self.pooling_params) - pooling_params = [ - self.pooling_params[req_id] for req_id in self.req_ids - ] + def get_pooling_params(self) -> list[PoolingParams]: + assert len(self.req_ids) == len(self.pooling_params) + return [self.pooling_params[req_id] for req_id in self.req_ids] + + def get_pooling_metadata(self) -> PoolingMetadata: + pooling_params = self.get_pooling_params() return PoolingMetadata( prompt_lens=torch.from_numpy( diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 08e13ab887bf9..96dafd6add679 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -138,7 +138,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ cache_config.cache_dtype] - self.is_pooling_model = model_config.pooler_config is not None + self.is_pooling_model = (model_config.runner_type == 'pooling') self.is_multimodal_raw_input_only_model = ( model_config.is_multimodal_raw_input_only_model) @@ -332,17 +332,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def _init_model_kwargs(self, num_tokens: int): model_kwargs = dict[str, Any]() - num_reqs = self.input_batch.num_reqs - num_pooling_reqs = len(self.input_batch.pooling_params) - - if num_pooling_reqs == 0: + if not self.is_pooling_model: return model_kwargs - # This does nontrivial work. 
- pooling_params = self.input_batch.pooling_metadata.pooling_params - - assert num_pooling_reqs == num_reqs + num_reqs = self.input_batch.num_reqs + pooling_params = self.input_batch.get_pooling_params() token_type_id_requests = dict[int, Any]() for i, param in enumerate(pooling_params): @@ -456,7 +451,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): else: generator = None - if pooling_params: + if self.is_pooling_model: + assert pooling_params is not None task = pooling_params.task assert task is not None, "You did not set `task` in the API" @@ -1437,7 +1433,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): " a batch must be pooling request" hidden_states = hidden_states[:num_scheduled_tokens] - pooling_metadata = self.input_batch.pooling_metadata + pooling_metadata = self.input_batch.get_pooling_metadata() pooling_metadata.build_pooling_cursor(num_scheduled_tokens_np.tolist(), device=hidden_states.device) seq_lens_cpu = self.seq_lens.cpu[:self.input_batch.num_reqs] @@ -1609,7 +1605,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): all_gather_group=get_tp_group()) logits = None else: - if self.input_batch.pooling_params: + if self.is_pooling_model: return self._pool(hidden_states, num_scheduled_tokens, num_scheduled_tokens_np, kv_connector_output) From 1fa1d6a9a0784855b420a75c2b42d6600d62cb41 Mon Sep 17 00:00:00 2001 From: Benji Beck Date: Mon, 1 Sep 2025 21:01:36 -0700 Subject: [PATCH 17/95] Migrate OvisImagePatchInputs to TensorSchema (#22024) Signed-off-by: Benji Beck --- vllm/model_executor/models/ovis.py | 41 +++++++++++++++--------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index 41fd272397e64..f1bb18716b40d 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -19,7 +19,7 @@ """ PyTorch Ovis model.""" import math from collections.abc 
import Iterable, Mapping -from typing import Literal, Optional, TypedDict, Union +from typing import Annotated, Literal, Optional, Union import torch import torch.nn as nn @@ -49,6 +49,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.transformers_utils.processors.ovis import OvisProcessor +from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .utils import merge_multimodal_embeddings @@ -201,25 +202,22 @@ class VisualTokenizer(torch.nn.Module): return tokens -class OvisImagePatchInputs(TypedDict): +class OvisImagePatchInputs(TensorSchema): + """ + Dimensions: + - batch_patches: Batch size * number of patches + - patch_size: patch_size_x * patch_size_y * num_channels + - patch_indicators: Batch size * (number of patches + 1) + - patches_per_image: List of number of total patches for each image + in the batch. + """ type: Literal["image_patches"] - flat_data: torch.Tensor - """ - Shape: - `(batch_size * num_patches, patch_size_x * patch_size_y * num_channels)` - """ - - indicator_tokens: torch.Tensor - """ - Shape: - `(batch_size * (num_patches + 1))` - """ - - patches_per_image: list[int] - """ - List of number of total patches for each image in the batch. - This is used to restore the first two dimensions of `flat_data`. - """ + flat_data: Annotated[torch.Tensor, + TensorShape("batch_patches", "patch_size")] + indicator_tokens: Annotated[torch.Tensor, TensorShape("patch_indicators")] + patches_per_image: Annotated[list[int], + TensorShape("num_patches_per_image")] + # This is used to restore the first two dimensions of `flat_data`. class VisualEmbedding(torch.nn.Embedding): @@ -458,9 +456,12 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP): raise ValueError("Incorrect type of indicator_tokens. 
" f"Got type: {type(pixel_values)}") + flat_data = flatten_bn(pixel_values, concat=True) + if flat_data.ndim >= 3: + flat_data = flat_data.flatten(start_dim=1) return OvisImagePatchInputs( type="image_patches", - flat_data=flatten_bn(flatten_bn(pixel_values), concat=True), + flat_data=flat_data, patches_per_image=[ x.shape[0] for x in flatten_bn(pixel_values) ], From 7be0cb8e9e48849060660cf153205b4de1c1c854 Mon Sep 17 00:00:00 2001 From: Yan Ma Date: Tue, 2 Sep 2025 12:06:53 +0800 Subject: [PATCH 18/95] [XPU][Feature] fp8 online quantization support for XPU (#23148) Signed-off-by: Yan Ma Co-authored-by: Qiming Zhang --- vllm/_ipex_ops.py | 56 +++++- .../model_executor/layers/quantization/fp8.py | 25 +++ .../layers/quantization/ipex_quant.py | 159 +++++++++++++++++- vllm/platforms/xpu.py | 4 + 4 files changed, 242 insertions(+), 2 deletions(-) diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 79e3e448cada3..19f6c4e3060ce 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -1,11 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional +from typing import Optional, Union import torch from vllm.logger import init_logger +from vllm.platforms import current_platform logger = init_logger(__name__) @@ -349,3 +350,56 @@ class ipex_ops: def swap_blocks(src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor) -> None: torch.xpu.swap_blocks(src, dst, block_mapping) # type: ignore + + @staticmethod + def scaled_fp8_quant( + input: torch.Tensor, + scale: Optional[torch.Tensor] = None, + num_token_padding: Optional[int] = None, + scale_ub: Optional[torch.Tensor] = None, + use_per_token_if_dynamic: bool = False, + output: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Quantize input tensor to FP8 and return quantized tensor and scale. 
+ + This function is designed for both static and dynamic quantization: + If you provide the scale, it will use static scaling and if you omit + it, the scale will be determined dynamically. Currently, XPU platform + only supports dynamic quantization. The function also allows optional + padding of the output tensors for downstream kernels that will benefit + from padding. + + Args: + input: The input tensor to be quantized to FP8 + scale: Optional scaling factor for the FP8 quantization + scale_ub: Optional upper bound for scaling factor in dynamic + per token case + num_token_padding: If specified, pad the first dimension + of the output to at least this value. + use_per_token_if_dynamic: Whether to do per_tensor or per_token + in the dynamic quantization case. + + Returns: + tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and + scaling factor. + """ + # This code assumes batch_dim and num_tokens are flattened + assert (input.ndim == 2) + shape: Union[tuple[int, int], torch.Size] = input.shape + out_dtype: torch.dtype = current_platform.fp8_dtype() + if num_token_padding: + shape = (max(num_token_padding, input.shape[0]), shape[1]) + if output is None: + output = torch.empty(shape, device=input.device, dtype=out_dtype) + else: + assert num_token_padding is None, \ + "padding not supported if output passed in" + assert output.dtype == out_dtype + assert scale is None, "only dynamic fp8 quantization supported on XPU" + assert not use_per_token_if_dynamic, ( + "per token dynamic fp8 quantization not supported on XPU") + scale = torch.zeros(1, device=input.device, dtype=torch.float32) + torch.ops.torch_ipex.dynamic_scaled_fp8_quant(output, input, scale) + + return output, scale diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 48bac8697e466..d9e01dcf40d5a 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -137,10 +137,35 @@ 
class Fp8Config(QuantizationConfig): ignored_layers=ignored_layers, weight_block_size=weight_block_size) + def get_xpu_quant_method(self, layer: torch.nn.Module, + prefix: str) -> Optional["QuantizeMethodBase"]: + from vllm.attention.layer import Attention + from vllm.model_executor.layers.quantization.ipex_quant import ( + XPUFp8LinearMethod, XPUFp8MoEMethod) + fp8_config = Fp8Config( + is_checkpoint_fp8_serialized=self.is_checkpoint_fp8_serialized, + activation_scheme=self.activation_scheme, + ignored_layers=self.ignored_layers, + weight_block_size=self.weight_block_size) + + if isinstance(layer, LinearBase): + if is_layer_skipped(prefix=prefix, + ignored_layers=self.ignored_layers, + fused_mapping=self.packed_modules_mapping): + return UnquantizedLinearMethod() + return XPUFp8LinearMethod(fp8_config) + elif isinstance(layer, FusedMoE): + return XPUFp8MoEMethod(fp8_config, layer) + elif isinstance(layer, Attention): + return Fp8KVCacheMethod(self) + return None + def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]: from vllm.attention.layer import Attention # Avoid circular import + if current_platform.is_xpu(): + return self.get_xpu_quant_method(layer, prefix) if isinstance(layer, LinearBase): if is_layer_skipped(prefix=prefix, ignored_layers=self.ignored_layers, diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index 9c458954f960f..5f9d4814274c8 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -1,11 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import Any, Callable, Optional import torch from packaging import version +from torch.nn import Module +from torch.nn.parameter import Parameter +from vllm._ipex_ops import ipex_ops as ops +from 
vllm.model_executor.layers.fused_moe import (FusedMoEMethodBase, + FusedMoeWeightScaleSupported) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod) from vllm.model_executor.layers.quantization import QuantizationMethods @@ -13,7 +18,10 @@ from vllm.model_executor.layers.quantization.awq import (AWQLinearMethod, is_layer_skipped_awq) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.quantization.fp8 import (Fp8Config, + Fp8LinearMethod) from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod +from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform MIN_IPEX_VERSION = "2.6.0" @@ -251,3 +259,152 @@ class IPEXAWQLinearMethod(AWQLinearMethod): reshaped_x = x.reshape(-1, x.shape[-1]) out = layer.ipex_qlinear(reshaped_x) return out.reshape(x.shape[:-1] + (layer.ipex_output_size, )) + + +class XPUFp8LinearMethod(Fp8LinearMethod): + + def __init__(self, quant_config: Fp8Config): + super().__init__(quant_config) + + def process_weights_after_loading(self, layer: Module) -> None: + # If checkpoint not serialized fp8, quantize the weights. + if not self.quant_config.is_checkpoint_fp8_serialized: + qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, + scale=None) + # Update the layer with the new values. 
+ layer.weight = Parameter(qweight, requires_grad=False) + layer.weight_scale = Parameter(weight_scale, requires_grad=False) + layer.input_scale = None + + def apply(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + weight = layer.weight.data + weight_scale = layer.weight_scale.data + output = torch.ops.torch_ipex.fp8_gemm_w8a16(x, weight, True, + weight_scale, bias) + return output + + +class XPUFp8MoEMethod(FusedMoEMethodBase): + + def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module): + super().__init__(layer.moe_config) + self.quant_config = quant_config + + def create_weights(self, layer: Module, num_experts: int, hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, **extra_weight_attrs): + + layer.intermediate_size_per_partition = intermediate_size_per_partition + layer.hidden_size = hidden_size + layer.num_experts = num_experts + layer.orig_dtype = params_dtype + layer.weight_block_size = None + # WEIGHTS + w13_weight = torch.nn.Parameter(torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + w2_weight = torch.nn.Parameter(torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + # Allocate 2 scales for w1 and w3 respectively. + # They will be combined to a single scale after weight loading. 
+ w13_weight_scale = torch.nn.Parameter(torch.ones(num_experts, + 2, + dtype=torch.float32), + requires_grad=False) + w2_weight_scale = torch.nn.Parameter(torch.ones(num_experts, + dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}) + # INPUT_SCALES + layer.w13_input_scale = None + layer.w2_input_scale = None + + def process_weights_after_loading(self, layer: Module) -> None: + if not self.quant_config.is_checkpoint_fp8_serialized: + fp8_dtype = current_platform.fp8_dtype() + w13_weight = torch.empty_like(layer.w13_weight.data, + dtype=fp8_dtype) + w2_weight = torch.empty_like(layer.w2_weight.data, dtype=fp8_dtype) + + # Re-initialize w13_scale because we directly quantize + # merged w13 weights and generate a single scaling factor. + layer.w13_weight_scale = torch.nn.Parameter(torch.ones( + layer.local_num_experts, + dtype=torch.float32, + device=w13_weight.device), + requires_grad=False) + for expert in range(layer.local_num_experts): + w13_weight[expert, :, :], layer.w13_weight_scale[ + expert] = ops.scaled_fp8_quant( + layer.w13_weight.data[expert, :, :]) + w2_weight[expert, :, :], layer.w2_weight_scale[ + expert] = ops.scaled_fp8_quant( + layer.w2_weight.data[expert, :, :]) + layer.w13_weight = torch.nn.Parameter(w13_weight, + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(w2_weight, + requires_grad=False) + import intel_extension_for_pytorch as ipex + layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE( + layer.w13_weight, + layer.w2_weight, + w1_scale_inv=layer.w13_weight_scale, + w2_scale_inv=layer.w2_weight_scale, + a1_scale_inv=layer.w13_input_scale, + a2_scale_inv=layer.w2_input_scale, + use_prepack=True, + ) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + 
renormalize: bool, + use_grouped_topk: bool = False, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0, + e_score_correction_bias: Optional[torch.Tensor] = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: Optional[torch.Tensor] = None, + logical_to_physical_map: Optional[torch.Tensor] = None, + logical_replica_count: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + return layer.ipex_fusion( + x, + use_grouped_topk, + top_k, + router_logits, + renormalize, + topk_group, + num_expert_group, + custom_routing_function=custom_routing_function, + ) diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 84f4cd7256465..d61b921e19cfe 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -148,6 +148,10 @@ class XPUPlatform(Platform): torch.xpu.reset_peak_memory_stats(device) return torch.xpu.max_memory_allocated(device) + @classmethod + def fp8_dtype(cls) -> torch.dtype: + return torch.float8_e5m2 + @classmethod def is_data_center_gpu(cls) -> bool: device_name = cls.get_device_name().lower() From 56d04089ef508003c684c90429046d90f2117547 Mon Sep 17 00:00:00 2001 From: Benji Beck Date: Mon, 1 Sep 2025 21:35:45 -0700 Subject: [PATCH 19/95] Migrate Interns1 inputs to TensorSchema (#23510) Signed-off-by: Benji Beck --- vllm/model_executor/models/interns1.py | 101 ++++++++++++------------- 1 file changed, 50 insertions(+), 51 deletions(-) diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index c739e74b058fa..26e358f9394c6 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -7,7 +7,7 @@ # Licensed under The MIT License [see LICENSE for details] # 
-------------------------------------------------------- from collections.abc import Iterable, Mapping, Sequence -from typing import Literal, Optional, TypedDict, Union +from typing import Annotated, Literal, Optional, Union import regex as re import torch @@ -32,6 +32,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) @@ -62,51 +63,60 @@ class InternS1MultiModalProjector(nn.Module): return hidden_states -class InternS1ImagePixelInputs(TypedDict): - type: Literal["pixel_values"] - pixel_values: torch.Tensor +class InternS1ImagePixelInputs(TensorSchema): """ - Shape: - `(batch_size * num_images * (1 + num_patches), num_channels, height, width)` + Dimensions: + - bnp: Batch size * number of images * (1 + num_patches) + - c: Number of channels (3) + - h: Height + - w: Width + - bn: Batch size * number of images """ + type: Literal["pixel_values"] = "pixel_values" + pixel_values: Annotated[torch.Tensor, TensorShape("bnp", 3, "h", "w")] + num_patches: Annotated[torch.Tensor, TensorShape("bn")] -class InternS1ImageEmbeddingInputs(TypedDict): - type: Literal["image_embeds"] - data: Union[torch.Tensor, list[torch.Tensor]] +class InternS1ImageEmbeddingInputs(TensorSchema): """ - A tensor of shape `(num_images, total_image_feature_size, hidden_size)` - or a list of tensors of shape `(total_image_feature_size, hidden_size)` - - `hidden_size` must match the hidden size of language model backbone. 
+ Dimensions: + - ni: Number of images + - tifs: Total image feature size + - hs: Hidden size (must match language model backbone) """ + type: Literal["image_embeds"] = "image_embeds" + data: Annotated[Union[torch.Tensor, list[torch.Tensor]], + TensorShape("ni", "tifs", "hs")] InternS1ImageInputs = Union[InternS1ImagePixelInputs, InternS1ImageEmbeddingInputs] -class InternS1VideoPixelInputs(TypedDict): - type: Literal["pixel_values_videos"] - pixel_values: torch.Tensor +class InternS1VideoPixelInputs(TensorSchema): """ - Shape: - `(batch_size * num_video * num_frames, num_channels, height, width)` + Dimensions: + - bnv: Batch size * number of videos * number of frames + - bn: Batch size * number of images + - c: Number of channels (3) + - h: Height + - w: Width """ - - num_patches: torch.Tensor - """Shape: `(batch_size * num_images)`""" + type: Literal["pixel_values_videos"] = "pixel_values_videos" + pixel_values: Annotated[torch.Tensor, TensorShape("bnv", 3, "h", "w")] + num_patches: Annotated[torch.Tensor, TensorShape("bn")] -class InternS1VideoEmbeddingInputs(TypedDict): - type: Literal["video_embeds"] - data: Union[torch.Tensor, list[torch.Tensor]] +class InternS1VideoEmbeddingInputs(TensorSchema): """ - A tensor of shape `(num_videos, total_video_feature_size, hidden_size)` - or a list of tensors of shape `(total_video_feature_size, hidden_size)` - - `hidden_size` must match the hidden size of language model backbone. 
+ Dimensions: + - nv: Number of videos + - tvfs: Total video feature size + - hs: Hidden size (must match language model backbone) """ + type: Literal["video_embeds"] = "video_embeds" + data: Annotated[Union[torch.Tensor, list[torch.Tensor]], + TensorShape("nv", "tvfs", "hs")] InternS1VideoInputs = Union[InternS1VideoPixelInputs, @@ -572,26 +582,6 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal, vit_embeds = self.multi_modal_projector(vit_embeds) return vit_embeds - def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: - - h, w = self.config.vision_config.image_size - expected_dims = (3, h, w) - - def _validate_shape(d: torch.Tensor): - actual_dims = tuple(d.shape) - - if actual_dims != expected_dims: - expected_expr = str(expected_dims) - raise ValueError( - "The expected shape of pixel values per image per batch " - f" per patch is {expected_expr}. " - f"You supplied {tuple(d.shape)}.") - - for d in data: - _validate_shape(d) - - return data - def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[InternS1ImageInputs]: pixel_values = kwargs.pop("pixel_values", None) @@ -627,10 +617,15 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal, pixel_values = flatten_bn(pixel_values, concat=True) image_num_patches = flatten_bn(image_num_patches, concat=True) + h, w = self.config.vision_config.image_size return InternS1ImagePixelInputs( type="pixel_values", - pixel_values=self._validate_pixel_values(pixel_values), + pixel_values=pixel_values, num_patches=image_num_patches, + resolve_bindings={ + "h": h, + "w": w, + }, ) raise AssertionError("This line should be unreachable.") @@ -671,11 +666,15 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal, concat=True) video_num_patches = flatten_bn(video_num_patches, concat=True) + h, w = self.config.vision_config.image_size return InternS1VideoPixelInputs( type="pixel_values_videos", - pixel_values=self._validate_pixel_values( - 
pixel_values_flat_video), num_patches=video_num_patches, + pixel_values=pixel_values_flat_video, + resolve_bindings={ + "h": h, + "w": w, + }, ) raise AssertionError("This line should be unreachable.") From fad73be1a54391efc5b974c86a077b421b58dace Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Tue, 2 Sep 2025 11:38:55 +0200 Subject: [PATCH 20/95] [Doc]: fix typos in Python comments (#24077) Signed-off-by: Didier Durand --- tests/async_engine/test_api_server.py | 2 +- tests/core/block/e2e/test_correctness.py | 4 ++-- tests/engine/test_arg_utils.py | 2 +- tests/kernels/moe/test_deepep_deepgemm_moe.py | 2 +- tests/lora/test_add_lora.py | 4 ++-- tests/lora/test_lora_allowed_token_ids.py | 4 ++-- tests/models/language/generation/test_common.py | 2 +- tests/models/language/generation/test_mistral.py | 4 ++-- tests/models/multimodal/generation/test_qwen2_vl.py | 4 ++-- tests/v1/core/test_kv_cache_utils.py | 2 +- tests/v1/executor/test_executor.py | 2 +- tests/v1/spec_decode/test_eagle.py | 2 +- tests/v1/test_kv_sharing.py | 2 +- tests/v1/worker/test_gpu_model_runner.py | 2 +- 14 files changed, 19 insertions(+), 19 deletions(-) diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 76c94bdf80ca8..90f63e7ea17db 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -98,7 +98,7 @@ def test_api_server(api_server, distributed_executor_backend: str): pool.join() # check cancellation stats - # give it some times to update the stats + # give it some time to update the stats time.sleep(1) num_aborted_requests = requests.get( diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index 93222b564ebe7..8de48ef59a013 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -439,10 +439,10 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator, 
@pytest.mark.parametrize("seed", [1]) def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator, test_llm_generator): - """Verify block manager v2 with auto prefix caching could works normal + """Verify block manager v2 with auto prefix caching could work normally even when eviction started. With APC enabled, all blocks are held by native block at the beginning. - Then blocks are managed by evictor instead. If cache hit at the evitor's + Then blocks are managed by evictor instead. If cache hit at the evictor's block, then it could be reused, or we need to recompute its kv cache. """ output_len = 10 diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 93ac18dfcc7b4..ba8e31a79feb5 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -167,7 +167,7 @@ def test_get_kwargs(): # dict should have json tip in help json_tip = "Should either be a valid JSON string or JSON keys" assert json_tip in kwargs["json_tip"]["help"] - # nested config should should construct the nested config + # nested config should construct the nested config assert kwargs["nested_config"]["type"]('{"field": 2}') == NestedConfig(2) diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 36a98522a6588..6558cab6a9eff 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -282,7 +282,7 @@ def triton_impl(a: torch.Tensor, topk_ids: torch.Tensor, a1_scale=a1_scale, block_shape=block_shape, # Make sure this is set to False so we - # dont end up comparing the same implementation. + # don't end up comparing the same implementation. 
allow_deep_gemm=False) diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py index 44755c603f281..35d0245759154 100644 --- a/tests/lora/test_add_lora.py +++ b/tests/lora/test_add_lora.py @@ -59,10 +59,10 @@ async def requests_processing_time(llm, @pytest.mark.asyncio async def test_add_lora(chatglm3_lora_files): """ - The add_lora function is used to pre-load some LoRA adapters into the + The add_lora function is used to preload some LoRA adapters into the engine in anticipation of future requests using these adapters. To test this functionality, we use the async engine to process some requests - We - do it twice, once with add_lora() pre-loading and once without. + do it twice, once with add_lora() preloading and once without. We measure the request processing time in both cases and expect the time to be lesser in the case with add_lora() calls. diff --git a/tests/lora/test_lora_allowed_token_ids.py b/tests/lora/test_lora_allowed_token_ids.py index 01bc102bd112b..e77eae70445db 100644 --- a/tests/lora/test_lora_allowed_token_ids.py +++ b/tests/lora/test_lora_allowed_token_ids.py @@ -18,7 +18,7 @@ def test_allowed_token_ids_with_lora_vocab(llama_2_7b_base_huggingface_id, adapters that define additional tokens. """ - # Setup a base model compatible with the sql_lora_files adapter and + # Set up a base model compatible with the sql_lora_files adapter and # a known number of tokens in the base model. model_config = ModelConfig( model=llama_2_7b_base_huggingface_id, @@ -84,7 +84,7 @@ def test_allowed_token_ids_with_lora_adapter_no_vocab( adapters that do not define additional tokens. """ - # Setup a base model compatible with the qwen25vl_lora_files adapter and + # Set up a base model compatible with the qwen25vl_lora_files adapter and # a known number of tokens in the base model. 
model_config = ModelConfig( model=qwen25vl_base_huggingface_id, diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index 4c4434c94145a..8a04946b2ffb3 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -13,7 +13,7 @@ from ...registry import HF_EXAMPLE_MODELS from ...utils import check_logprobs_close # These have unsupported head_dim for FA. We do not -# not have a clean way to fall back, so we fail with +# have a clean way to fall back, so we fail with # a clear msg when it happens. # https://github.com/vllm-project/vllm/issues/14524 REQUIRES_V0 = ["microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"] diff --git a/tests/models/language/generation/test_mistral.py b/tests/models/language/generation/test_mistral.py index af51a60edfd62..845afbfa8a45e 100644 --- a/tests/models/language/generation/test_mistral.py +++ b/tests/models/language/generation/test_mistral.py @@ -20,7 +20,7 @@ MISTRAL_FORMAT_MODELS = [ "mistralai/Mistral-7B-Instruct-v0.3", # uses the v3-Tekken tokenizer "mistralai/Ministral-8B-Instruct-2410", - # Mistral-Nemo is to big for CI, but passes locally + # Mistral-Nemo is too big for CI, but passes locally # "mistralai/Mistral-Nemo-Instruct-2407" ] @@ -273,7 +273,7 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None: def test_mistral_function_call_nested_json(): - """Ensure that the function-name regex captures the entire outer-most + """Ensure that the function-name regex captures the entire outermost JSON block, including nested braces.""" # Create a minimal stub tokenizer that provides the few attributes the diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py index c61c27ae204a3..a81f5e7ec8872 100644 --- a/tests/models/multimodal/generation/test_qwen2_vl.py +++ b/tests/models/multimodal/generation/test_qwen2_vl.py @@ -154,7 +154,7 @@ 
def batch_make_image_embeddings( embed_counter += cur_batch_embed_len image_counter += cur_batch_image_count - # ensure we don't lost any images or embeddings + # ensure we don't lose any images or embeddings assert embed_counter == image_embeds.size(0) assert image_counter == image_grid_thw.size(0) assert len(image_batches) == len(result) @@ -238,7 +238,7 @@ def batch_make_video_embeddings( embed_counter += cur_batch_embed_len video_counter += cur_batch_video_count - # ensure we don't lost any videos or embeddings + # ensure we don't lose any videos or embeddings assert embed_counter == video_embeds.size(0) assert video_counter == video_grid_thw.size(0) assert len(video_batches) == len(result) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index e738f2bd46472..4d0a26f76e98e 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -247,7 +247,7 @@ def test_free_kv_cache_block_queue_append_n(): def test_free_kv_cache_block_queue_popleft_n(): blocks = [KVCacheBlock(block_id=i) for i in range(6)] - # Create a empty FreeKVCacheBlockQueue with these blocks + # Create an empty FreeKVCacheBlockQueue with these blocks queue = FreeKVCacheBlockQueue( [blocks[1], blocks[3], blocks[5], blocks[4], blocks[0], blocks[2]]) assert queue.num_free_blocks == 6 diff --git a/tests/v1/executor/test_executor.py b/tests/v1/executor/test_executor.py index bdd5155c1481d..4e83e2f9d4b63 100644 --- a/tests/v1/executor/test_executor.py +++ b/tests/v1/executor/test_executor.py @@ -27,7 +27,7 @@ class CustomMultiprocExecutor(MultiprocExecutor): kwargs: Optional[dict] = None, non_block: bool = False, unique_reply_rank: Optional[int] = None) -> list[Any]: - # Drop marker to show that this was ran + # Drop marker to show that this was run with open(".marker", "w"): ... 
return super().collective_rpc(method, timeout, args, kwargs) diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 7b8445a0b2878..46e3a611c6d26 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -183,7 +183,7 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method, mock_pp_group.world_size = pp_size mock_get_pp_group.return_value = mock_pp_group - # Setup the target model mock with a custom class so that + # Set up the target model mock with a custom class so that # isinstance() checks match the expected type. class _TargetModelStub(LlamaForCausalLM): model: mock.MagicMock diff --git a/tests/v1/test_kv_sharing.py b/tests/v1/test_kv_sharing.py index 6b01b7d3e1d6c..96848047145b6 100644 --- a/tests/v1/test_kv_sharing.py +++ b/tests/v1/test_kv_sharing.py @@ -30,7 +30,7 @@ def test_initialize_kv_cache_for_kv_sharing_different_attn_groups(): } # Layers 0 and 1 both belong in KV cache group 0 - # However, if they have have different attention backends, they will be + # However, if they have different attention backends, they will be # placed in different attention groups for KV cache group 0 kv_cache_groups = [ KVCacheGroupSpec(["model.layers.0", "model.layers.1"], diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index d6cd03fb01a73..6d99029e404ef 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -702,7 +702,7 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch): KVCacheTensors for the attention and mamba layers (via _reshape_kv_cache_tensors function). 
This test verifies that the views are compatible: writing a mamba block - will not corrupt an attention block and vice-versa + will not corrupt an attention block and vice versa ''' current_platform.seed_everything(42) From 2f0bab3f26678ebaacdbdd61e0bec581fa7e30b9 Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Tue, 2 Sep 2025 18:48:18 +0800 Subject: [PATCH 21/95] [Model] Support dp on ViT on GLM-4.5V (#23168) Signed-off-by: David Chen <530634352@qq.com> --- docs/configuration/optimization.md | 1 + vllm/model_executor/models/glm4_1v.py | 203 ++++++++++++++++++-------- 2 files changed, 145 insertions(+), 59 deletions(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index b0ea9621d545a..0ab2ae58ad861 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -174,6 +174,7 @@ Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to u Known supported models: +- GLM-4.5V GLM-4.1V () - Kimi-VL () - Llama4 () - MiniCPM-V-2.5 or above (, ) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 662728e6b1393..f9fd5163d66b4 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -45,15 +45,20 @@ from transformers.models.glm4v.video_processing_glm4v import ( from transformers.video_utils import VideoMetadata from vllm.config import VllmConfig -from vllm.distributed import parallel_state +from vllm.distributed import (get_tensor_model_parallel_world_size, + parallel_state) from vllm.distributed import utils as dist_utils from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.layernorm import RMSNorm +# yapf: disable from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, + MergedReplicatedLinear, QKVParallelLinear, + ReplicatedLinear, 
RowParallelLinear) +# yapf: enable from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys @@ -66,6 +71,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.multimodal.utils import run_dp_sharded_mrope_vision_model from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope @@ -153,7 +159,7 @@ class Glm4vVideoEmbeddingInputs(TensorSchema): Glm4vVideoInputs = Union[Glm4vVideoPixelInputs, Glm4vVideoEmbeddingInputs] -# === Vision Encoder === # +# ==== Vision Encoder ==== # class Glm4vVisionMLP(nn.Module): @@ -165,19 +171,23 @@ class Glm4vVisionMLP(nn.Module): bias: bool = False, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ): super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - input_size=in_features, - output_sizes=[hidden_features] * 2, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.gate_up_proj") - self.down_proj = RowParallelLinear(hidden_features, - in_features, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.down_proj") + cls_gate_up = (MergedReplicatedLinear + if use_data_parallel else MergedColumnParallelLinear) + self.gate_up_proj = cls_gate_up(input_size=in_features, + output_sizes=[hidden_features] * 2, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") + cls_down = (ReplicatedLinear + if use_data_parallel else RowParallelLinear) + self.down_proj = cls_down(hidden_features, + in_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.down_proj") self.act_fn = SiluAndMul() def forward(self, x: torch.Tensor): @@ 
-218,33 +228,54 @@ class Glm4vVisionAttention(nn.Module): projection_size: int, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() # Per attention head and per partition values. - self.tp_size = parallel_state.get_tensor_model_parallel_world_size() + self.tp_size = (1 if use_data_parallel else + get_tensor_model_parallel_world_size()) self.tp_rank = parallel_state.get_tensor_model_parallel_rank() self.hidden_size_per_attention_head = dist_utils.divide( projection_size, num_heads) self.num_attention_heads_per_partition = dist_utils.divide( num_heads, self.tp_size) - self.qkv = QKVParallelLinear( - hidden_size=embed_dim, - head_size=self.hidden_size_per_attention_head, - total_num_heads=num_heads, - total_num_kv_heads=num_heads, - bias=False, - quant_config=quant_config, - # Change qkv prefix to align with GLM-4.5V-FP8 quantization config - prefix=f"{prefix}.qkv_proj" if quant_config else f"{prefix}.qkv", - ) - self.proj = RowParallelLinear( - input_size=projection_size, - output_size=embed_dim, - quant_config=quant_config, - prefix=f"{prefix}.proj", - bias=False, - ) + if use_data_parallel: + self.qkv = ReplicatedLinear( + input_size=embed_dim, + output_size=3 * projection_size, + bias=False, + quant_config=quant_config, + # Change qkv prefix to align with GLM-4.5V-FP8 quantization cfg + prefix=f"{prefix}.qkv_proj" + if quant_config else f"{prefix}.qkv", + ) + self.proj = ReplicatedLinear( + input_size=projection_size, + output_size=embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.proj", + bias=False, + ) + else: + self.qkv = QKVParallelLinear( + hidden_size=embed_dim, + head_size=self.hidden_size_per_attention_head, + total_num_heads=num_heads, + total_num_kv_heads=num_heads, + bias=False, + quant_config=quant_config, + # Change qkv prefix to align with GLM-4.5V-FP8 quantization cfg + prefix=f"{prefix}.qkv_proj" + if quant_config else f"{prefix}.qkv", + ) + self.proj = 
RowParallelLinear( + input_size=projection_size, + output_size=embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.proj", + bias=False, + ) # Detect attention implementation. self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) @@ -375,6 +406,7 @@ class Glm4vVisionBlock(nn.Module): norm_layer: Optional[Callable[[int], nn.Module]] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() if norm_layer is None: @@ -387,6 +419,7 @@ class Glm4vVisionBlock(nn.Module): projection_size=dim, quant_config=quant_config, prefix=f"{prefix}.attn", + use_data_parallel=use_data_parallel, ) self.mlp = Glm4vVisionMLP( dim, @@ -394,6 +427,7 @@ class Glm4vVisionBlock(nn.Module): bias=False, quant_config=quant_config, prefix=f"{prefix}.mlp", + use_data_parallel=use_data_parallel, ) def forward( @@ -456,24 +490,40 @@ class Glm4vPatchMerger(nn.Module): quant_config: Optional[QuantizationConfig] = None, bias: bool = False, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() self.hidden_size = d_model - self.proj = ColumnParallelLinear(self.hidden_size, - self.hidden_size, - bias=bias, - gather_output=True, - quant_config=quant_config, - prefix=f"{prefix}.proj") + if use_data_parallel: + self.proj = ReplicatedLinear( + input_size=self.hidden_size, + output_size=self.hidden_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.proj", + ) + else: + self.proj = ColumnParallelLinear( + self.hidden_size, + self.hidden_size, + bias=bias, + gather_output=True, + quant_config=quant_config, + prefix=f"{prefix}.proj", + ) self.post_projection_norm = nn.LayerNorm(self.hidden_size) - self.gate_up_proj = MergedColumnParallelLinear( + cls_gate_up = (MergedReplicatedLinear + if use_data_parallel else MergedColumnParallelLinear) + self.gate_up_proj = cls_gate_up( input_size=self.hidden_size, output_sizes=[context_dim] * 2, bias=bias, 
quant_config=quant_config, prefix=f"{prefix}.gate_up_proj", ) - self.down_proj = RowParallelLinear( + cls_down = (ReplicatedLinear + if use_data_parallel else RowParallelLinear) + self.down_proj = cls_down( context_dim, self.hidden_size, bias=bias, @@ -548,14 +598,33 @@ class Glm4vVisionEmbeddings(nn.Module): dtype=torch.float32)) # Calculate target dimensions for each patch - target_h = torch.cat([ - image_shapes[i, 1].repeat(lengths[i]) - for i in range(len(lengths)) - ]).to(device=device, dtype=torch.float32) - target_w = torch.cat([ - image_shapes[i, 2].repeat(lengths[i]) - for i in range(len(lengths)) - ]).to(device=device, dtype=torch.float32) + # Add bounds checking for data parallel mode + if len(lengths) > image_shapes.shape[0]: + # In data parallel mode, some GPUs might not have all + # image shapes + # Use available image shapes, cycling if necessary + target_h_list = [] + target_w_list = [] + for i in range(len(lengths)): + # Cycle through available shapes + shape_idx = i % image_shapes.shape[0] + target_h_list.append(image_shapes[shape_idx, + 1].repeat(lengths[i])) + target_w_list.append(image_shapes[shape_idx, + 2].repeat(lengths[i])) + target_h = torch.cat(target_h_list).to(device=device, + dtype=torch.float32) + target_w = torch.cat(target_w_list).to(device=device, + dtype=torch.float32) + else: + target_h = torch.cat([ + image_shapes[i, 1].repeat(lengths[i]) + for i in range(len(lengths)) + ]).to(device=device, dtype=torch.float32) + target_w = torch.cat([ + image_shapes[i, 2].repeat(lengths[i]) + for i in range(len(lengths)) + ]).to(device=device, dtype=torch.float32) # Normalize coordinates to [-1, 1] range for grid_sample h_coords = h_coords.to(device=device, dtype=torch.float32) @@ -629,6 +698,7 @@ class Glm4vVisionTransformer(nn.Module): norm_eps: float = 1e-6, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ) -> None: super().__init__() @@ -638,6 +708,7 @@ class 
Glm4vVisionTransformer(nn.Module): depth = vision_config.depth self.hidden_size = vision_config.hidden_size self.num_heads = vision_config.num_heads + self.use_data_parallel = use_data_parallel self.patch_size = vision_config.patch_size self.spatial_merge_size = vision_config.spatial_merge_size @@ -661,6 +732,7 @@ class Glm4vVisionTransformer(nn.Module): norm_layer=norm_layer, quant_config=quant_config, prefix=f"{prefix}.blocks.{layer_idx}", + use_data_parallel=self.use_data_parallel, ) for layer_idx in range(depth) ]) self.merger = Glm4vPatchMerger( @@ -669,6 +741,7 @@ class Glm4vVisionTransformer(nn.Module): quant_config=quant_config, bias=False, prefix=f"{prefix}.merger", + use_data_parallel=self.use_data_parallel, ) self.embeddings = Glm4vVisionEmbeddings(vision_config) @@ -731,8 +804,11 @@ class Glm4vVisionTransformer(nn.Module): def forward( self, x: torch.Tensor, - grid_thw: torch.Tensor, + grid_thw: list[list[int]], ) -> torch.Tensor: + # Convert grid_thw to tensor (always expecting list format now) + grid_thw = torch.tensor(grid_thw, device=x.device, dtype=torch.long) + # patchify x = x.to(device=self.device, dtype=self.dtype) x = self.patch_embed(x) @@ -1250,6 +1326,8 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, "model.visual.": "visual.", }) + supports_encoder_tp_data = True + @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: if modality.startswith("image"): @@ -1267,12 +1345,14 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, self.config = config self.multimodal_config = multimodal_config + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" self.visual = Glm4vVisionTransformer( config.vision_config, norm_eps=getattr(config, "rms_norm_eps", 1e-5), quant_config=quant_config, prefix=maybe_prefix(prefix, "visual"), + use_data_parallel=self.use_data_parallel, ) if config.model_type == "glm4v": @@ -1382,8 +1462,14 @@ class 
Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, image_embeds = image_input["image_embeds"].type(self.visual.dtype) else: pixel_values = image_input["pixel_values"].type(self.visual.dtype) - image_embeds = self.visual(pixel_values, grid_thw=grid_thw) - + if self.use_data_parallel: + return run_dp_sharded_mrope_vision_model(self.visual, + pixel_values, + grid_thw.tolist(), + rope_type="rope_3d") + else: + image_embeds = self.visual(pixel_values, + grid_thw=grid_thw.tolist()) merge_size = self.visual.spatial_merge_size sizes = grid_thw.prod(-1) // merge_size // merge_size return image_embeds.split(sizes.tolist()) @@ -1393,23 +1479,22 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, grid_thw = video_input["video_grid_thw"] assert grid_thw.ndim == 2 - device = self.visual.device - flat_grid_thw = torch.cat([ - torch.tensor([[1, h, w]] * t, device=device) - for t, h, w in grid_thw - ]) if video_input["type"] == "video_embeds": video_embeds = video_input["video_embeds"].type(self.visual.dtype) else: pixel_values_videos = video_input["pixel_values_videos"].type( self.visual.dtype) - video_embeds = self.visual(pixel_values_videos, - grid_thw=flat_grid_thw) - + if self.use_data_parallel: + return run_dp_sharded_mrope_vision_model(self.visual, + pixel_values_videos, + grid_thw.tolist(), + rope_type="rope_3d") + else: + video_embeds = self.visual(pixel_values_videos, + grid_thw=grid_thw.tolist()) # Split concatenated embeddings for each video item. 
merge_size = self.visual.spatial_merge_size sizes = grid_thw.prod(-1) // merge_size // merge_size - return video_embeds.split(sizes.tolist()) def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: From ce30dca5c44353f278dc114bd6f03b11700088eb Mon Sep 17 00:00:00 2001 From: Aziz Date: Tue, 2 Sep 2025 12:49:32 +0200 Subject: [PATCH 22/95] [CI]: reduce HTTP calls inside entrypoints openai tests (#23646) Signed-off-by: AzizCode92 Signed-off-by: Aziz Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/entrypoints/conftest.py | 29 +++++++++++++++++++ tests/entrypoints/openai/test_chat.py | 2 -- tests/entrypoints/openai/test_completion.py | 26 ----------------- .../test_completion_with_prompt_embeds.py | 27 +---------------- .../entrypoints/openai/test_lora_adapters.py | 8 ----- tests/entrypoints/openai/test_models.py | 8 ----- .../openai/test_return_tokens_as_ids.py | 2 -- tests/entrypoints/openai/test_tokenization.py | 2 -- 8 files changed, 30 insertions(+), 74 deletions(-) diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py index a7c533ec24198..48fd848e88200 100644 --- a/tests/entrypoints/conftest.py +++ b/tests/entrypoints/conftest.py @@ -201,3 +201,32 @@ table: "table_1" | "table_2" condition: column "=" number number: "1" | "2" """) + + +@pytest.fixture(scope="session") +def zephyr_lora_files(): + """Download zephyr LoRA files once per test session.""" + from huggingface_hub import snapshot_download + return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora") + + +@pytest.fixture(scope="session") +def zephyr_lora_added_tokens_files(zephyr_lora_files): + """Create zephyr LoRA files with added tokens once per test session.""" + import shutil + from tempfile import TemporaryDirectory + + from transformers import AutoTokenizer + + tmp_dir = TemporaryDirectory() + tmp_model_dir = f"{tmp_dir.name}/zephyr" + shutil.copytree(zephyr_lora_files, tmp_model_dir) + tokenizer = 
AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") + # Copy tokenizer to adapter and add some unique tokens + # 32000, 32001, 32002 + added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"], + special_tokens=True) + assert added == 3 + tokenizer.save_pretrained(tmp_model_dir) + yield tmp_model_dir + tmp_dir.cleanup() diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 5ad29d70f10df..c9947c54a9181 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -15,8 +15,6 @@ import torch from openai import BadRequestError, OpenAI from ...utils import RemoteOpenAIServer -from .test_completion import zephyr_lora_added_tokens_files # noqa: F401 -from .test_completion import zephyr_lora_files # noqa: F401 # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index 74ef6deeea16b..d55f8d9d65d9b 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -3,8 +3,6 @@ # imports for guided decoding tests import json import os -import shutil -from tempfile import TemporaryDirectory from typing import Optional import jsonschema @@ -14,9 +12,7 @@ import pytest_asyncio import regex as re import requests # downloading lora to test lora requests -from huggingface_hub import snapshot_download from openai import BadRequestError -from transformers import AutoTokenizer from vllm.transformers_utils.tokenizer import get_tokenizer @@ -26,32 +22,10 @@ from ...utils import RemoteOpenAIServer MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" # technically these adapters use a different base model, # but we're not testing generation quality here -LORA_NAME = "typeof/zephyr-7b-beta-lora" GUIDED_DECODING_BACKENDS = ["outlines", "xgrammar", "guidance"] -@pytest.fixture(scope="module") -def zephyr_lora_files(): - return 
snapshot_download(repo_id=LORA_NAME) - - -@pytest.fixture(scope="module") -def zephyr_lora_added_tokens_files(zephyr_lora_files): - tmp_dir = TemporaryDirectory() - tmp_model_dir = f"{tmp_dir.name}/zephyr" - shutil.copytree(zephyr_lora_files, tmp_model_dir) - tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) - # Copy tokenizer to adapter and add some unique tokens - # 32000, 32001, 32002 - added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"], - special_tokens=True) - assert added == 3 - tokenizer.save_pretrained(tmp_model_dir) - yield tmp_model_dir - tmp_dir.cleanup() - - @pytest.fixture(scope="module") def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files): return [ diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py index 00d3ffb61ee9f..a0ef31762ea15 100644 --- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py +++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py @@ -3,48 +3,23 @@ import base64 import io -import shutil -from tempfile import TemporaryDirectory import openai # use the official client for correctness check import pytest import pytest_asyncio import torch # downloading lora to test lora requests -from huggingface_hub import snapshot_download from openai import BadRequestError -from transformers import AutoConfig, AutoTokenizer +from transformers import AutoConfig from ...utils import RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" -LORA_NAME = "typeof/zephyr-7b-beta-lora" CONFIG = AutoConfig.from_pretrained(MODEL_NAME) -@pytest.fixture(scope="module") -def zephyr_lora_files(): - return snapshot_download(repo_id=LORA_NAME) - - -@pytest.fixture(scope="module") -def zephyr_lora_added_tokens_files(zephyr_lora_files): - tmp_dir = TemporaryDirectory() - tmp_model_dir = f"{tmp_dir.name}/zephyr" - shutil.copytree(zephyr_lora_files, tmp_model_dir) - 
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) - # Copy tokenizer to adapter and add some unique tokens - # 32000, 32001, 32002 - added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"], - special_tokens=True) - assert added == 3 - tokenizer.save_pretrained(tmp_model_dir) - yield tmp_model_dir - tmp_dir.cleanup() - - @pytest.fixture(scope="module") def default_server_args( zephyr_lora_files, diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py index bcdeaaacedea0..f91dcf194b839 100644 --- a/tests/entrypoints/openai/test_lora_adapters.py +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -9,8 +9,6 @@ from contextlib import suppress import openai # use the official client for correctness check import pytest import pytest_asyncio -# downloading lora to test lora requests -from huggingface_hub import snapshot_download from ...utils import RemoteOpenAIServer @@ -18,7 +16,6 @@ from ...utils import RemoteOpenAIServer MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" # technically this needs Mistral-7B-v0.1 as base, but we're not testing # generation quality here -LORA_NAME = "typeof/zephyr-7b-beta-lora" BADREQUEST_CASES = [ ( @@ -48,11 +45,6 @@ BADREQUEST_CASES = [ ] -@pytest.fixture(scope="module") -def zephyr_lora_files(): - return snapshot_download(repo_id=LORA_NAME) - - @pytest.fixture(scope="module") def monkeypatch_module(): from _pytest.monkeypatch import MonkeyPatch diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py index 1980daa80db9e..7cd3ca196a431 100644 --- a/tests/entrypoints/openai/test_models.py +++ b/tests/entrypoints/openai/test_models.py @@ -4,8 +4,6 @@ import openai # use the official client for correctness check import pytest import pytest_asyncio -# downloading lora to test lora requests -from huggingface_hub import snapshot_download from ...utils import RemoteOpenAIServer @@ -13,12 +11,6 @@ from ...utils import RemoteOpenAIServer MODEL_NAME = 
"HuggingFaceH4/zephyr-7b-beta" # technically this needs Mistral-7B-v0.1 as base, but we're not testing # generation quality here -LORA_NAME = "typeof/zephyr-7b-beta-lora" - - -@pytest.fixture(scope="module") -def zephyr_lora_files(): - return snapshot_download(repo_id=LORA_NAME) @pytest.fixture(scope="module") diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py index af58fbd4b3640..5f43fdc9588f3 100644 --- a/tests/entrypoints/openai/test_return_tokens_as_ids.py +++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py @@ -11,8 +11,6 @@ from vllm.transformers_utils.tokenizer import get_tokenizer from ...utils import RemoteOpenAIServer from .test_completion import default_server_args # noqa: F401 -from .test_completion import zephyr_lora_added_tokens_files # noqa: F401 -from .test_completion import zephyr_lora_files # noqa: F401 from .test_completion import MODEL_NAME diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py index 0dbbdfbfd24ad..72c8a3510c9b0 100644 --- a/tests/entrypoints/openai/test_tokenization.py +++ b/tests/entrypoints/openai/test_tokenization.py @@ -8,8 +8,6 @@ import requests from vllm.transformers_utils.tokenizer import get_tokenizer from ...utils import RemoteOpenAIServer -from .test_completion import zephyr_lora_added_tokens_files # noqa: F401 -from .test_completion import zephyr_lora_files # noqa: F401 # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" From 8bd5844989373d6914f6dea94a56822e6d7380d4 Mon Sep 17 00:00:00 2001 From: Christian Berge <42270330+cberge908@users.noreply.github.com> Date: Tue, 2 Sep 2025 14:04:59 +0200 Subject: [PATCH 23/95] correct LWS deployment yaml (#23104) Signed-off-by: cberge908 <42270330+cberge908@users.noreply.github.com> --- docs/deployment/frameworks/lws.md | 6 ++---- examples/online_serving/multi-node-serving.sh | 2 +- 2 files changed, 3 
insertions(+), 5 deletions(-) diff --git a/docs/deployment/frameworks/lws.md b/docs/deployment/frameworks/lws.md index 3319dc6c90e1e..3b9fa3ea43d64 100644 --- a/docs/deployment/frameworks/lws.md +++ b/docs/deployment/frameworks/lws.md @@ -22,7 +22,7 @@ Deploy the following yaml file `lws.yaml` metadata: name: vllm spec: - replicas: 2 + replicas: 1 leaderWorkerTemplate: size: 2 restartPolicy: RecreateGroupOnPodRestart @@ -41,7 +41,7 @@ Deploy the following yaml file `lws.yaml` - sh - -c - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); - python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2" + vllm serve meta-llama/Meta-Llama-3.1-405B-Instruct --port 8080 --tensor-parallel-size 8 --pipeline_parallel_size 2" resources: limits: nvidia.com/gpu: "8" @@ -126,8 +126,6 @@ Should get an output similar to this: NAME READY STATUS RESTARTS AGE vllm-0 1/1 Running 0 2s vllm-0-1 1/1 Running 0 2s -vllm-1 1/1 Running 0 2s -vllm-1-1 1/1 Running 0 2s ``` Verify that the distributed tensor-parallel inference works: diff --git a/examples/online_serving/multi-node-serving.sh b/examples/online_serving/multi-node-serving.sh index e8ad8d3de5f41..3fc5502fb9bc2 100644 --- a/examples/online_serving/multi-node-serving.sh +++ b/examples/online_serving/multi-node-serving.sh @@ -11,7 +11,7 @@ # Example usage: # On the head node machine, start the Ray head node process and run a vLLM server. # ./multi-node-serving.sh leader --ray_port=6379 --ray_cluster_size= [] && \ -# python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2 +# vllm serve meta-llama/Meta-Llama-3.1-405B-Instruct --port 8080 --tensor-parallel-size 8 --pipeline_parallel_size 2 # # On each worker node, start the Ray worker node process. 
# ./multi-node-serving.sh worker --ray_address= --ray_port=6379 [] From 0a74e9d0f2367cc121547aa8e21e13b04d4cad30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Tue, 2 Sep 2025 16:23:35 +0200 Subject: [PATCH 24/95] [Gemma3n] Fix audio batching (#24052) Signed-off-by: NickLucche --- ...i_chat_completion_client_for_multimodal.py | 42 +++++++++++++++++++ vllm/model_executor/models/gemma3n_mm.py | 28 +++++++++---- 2 files changed, 63 insertions(+), 7 deletions(-) diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py index ac5f79b56e49f..37216a5cfe574 100644 --- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -266,10 +266,52 @@ def run_audio(model: str) -> None: print("Chat completion output from base64 encoded audio:", result) +def run_multi_audio(model: str) -> None: + from vllm.assets.audio import AudioAsset + + # Two different audios to showcase batched inference. 
+ audio_url = AudioAsset("winning_call").url + audio_base64 = encode_base64_content_from_url(audio_url) + audio_url2 = AudioAsset("azacinto_foscolo").url + audio_base64_2 = encode_base64_content_from_url(audio_url2) + + # OpenAI-compatible schema (`input_audio`) + chat_completion_from_base64 = client.chat.completions.create( + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "Are these two audios the same?"}, + { + "type": "input_audio", + "input_audio": { + "data": audio_base64, + "format": "wav", + }, + }, + { + "type": "input_audio", + "input_audio": { + "data": audio_base64_2, + "format": "wav", + }, + }, + ], + } + ], + model=model, + max_completion_tokens=64, + ) + + result = chat_completion_from_base64.choices[0].message.content + print("Chat completion output from input audio:", result) + + example_function_map = { "text-only": run_text_only, "single-image": run_single_image, "multi-image": run_multi_image, + "multi-audio": run_multi_audio, "video": run_video, "audio": run_audio, } diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index c25bbcd420c39..d831e9084db57 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -5,6 +5,7 @@ from typing import Any, Literal, Optional, TypedDict, Union, cast import numpy as np import torch +# yapf: disable from torch import nn from transformers import AutoModel, BatchFeature from transformers.models.gemma3n import (Gemma3nAudioConfig, @@ -30,7 +31,6 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) from vllm.multimodal.parse import (ImageProcessorItems, MultiModalDataItems, MultiModalDataParser) -# yapf: disable from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, MultiModalPromptUpdates, @@ -62,7 +62,8 @@ class Gemma3nImagePixelInputs(TypedDict): class Gemma3nAudioInputs(TypedDict): - input_features: torch.Tensor 
+ input_features: Union[torch.Tensor, list[torch.Tensor]] + input_features_padded: torch.Tensor """Shape: `(batch_size * num_audio, seq_length, num_features)`""" input_features_mask: torch.Tensor """Shape: `(batch_size * num_audio, seq_length)`""" @@ -188,8 +189,13 @@ class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo] mm_kwargs, tok_kwargs, ) + if 'input_features' in processed_outputs: - # Avoid padding since we need the output of each item to be + # Padding enables audio_tower to run in batched mode + processed_outputs["input_features_padded"] = \ + processed_outputs["input_features"] + + # Unpad features here since we need the output of each item to be # independent of other items for the cache to work correctly unpadded_features = [ f[mask] for f, mask in zip( @@ -206,9 +212,11 @@ class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo] hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: - return dict(pixel_values=MultiModalFieldConfig.batched("image"), - input_features=MultiModalFieldConfig.batched("audio"), - input_features_mask=MultiModalFieldConfig.batched("audio")) + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + input_features=MultiModalFieldConfig.batched("audio"), + input_features_padded=MultiModalFieldConfig.batched("audio"), + input_features_mask=MultiModalFieldConfig.batched("audio")) def _get_prompt_updates( self, @@ -516,9 +524,14 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal, if input_features_mask is None: return None + input_features_padded = kwargs.pop("input_features_padded", None) + if input_features_padded is None: + return None + return Gemma3nAudioInputs( input_features=input_features, input_features_mask=input_features_mask, + input_features_padded=input_features_padded, ) def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: @@ -564,7 +577,8 @@ class 
Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal, audio_input: Gemma3nAudioInputs, ) -> list[torch.Tensor]: assert self.audio_tower is not None - input_features = audio_input["input_features"].squeeze(1) + # Run on padded features to enable batching + input_features = audio_input["input_features_padded"].squeeze(1) input_features_mask = audio_input["input_features_mask"].squeeze(1) audio_outputs, audio_mask = self.audio_tower(input_features, ~input_features_mask) From 38ba061f6f441ac60fb31c68b12385ae00ff7614 Mon Sep 17 00:00:00 2001 From: Kyungmin Lee <30465912+lkm2835@users.noreply.github.com> Date: Tue, 2 Sep 2025 23:40:55 +0900 Subject: [PATCH 25/95] [BugFix] Fix EXAONE4 rotary embeddings (#23918) Signed-off-by: lkm2835 Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/exaone4.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index 971fcbd2aa275..e94c43a47f76a 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -164,8 +164,8 @@ class Exaone4Attention(nn.Module): is_sliding = config.layer_types[layer_idx] == "sliding_attention" self.sliding_window = config.sliding_window if is_sliding else None - # apply rotary embeddings to every layer - self.apply_all_layers = not is_sliding + # apply rotary embeddings to every layer in full attention models + self.apply_rope_all_layers = "sliding_attention" not in config.layer_types self.rotary_emb = get_rope( self.head_dim, @@ -201,7 +201,7 @@ class Exaone4Attention(nn.Module): k = self.k_norm(k) k = k.flatten(-2, -1) - if self.sliding_window or self.apply_all_layers: + if self.sliding_window or self.apply_rope_all_layers: q, k = self.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v) output, _ = self.o_proj(attn_output) From 
e0653f6c0b9f331af0877e7c7abc99a85efc3982 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Wed, 3 Sep 2025 00:48:57 +0800 Subject: [PATCH 26/95] [Model] Classification models support logit_bias / sigmoid_normalize (#24031) Signed-off-by: wang.yuqi Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/config/__init__.py | 45 ++++++++++++++------------- vllm/model_executor/layers/pooler.py | 8 +++++ vllm/model_executor/models/config.py | 4 ++- vllm/model_executor/models/jina_vl.py | 11 ++----- 4 files changed, 38 insertions(+), 30 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index f53e8b0308853..2e0212d010da0 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -2651,24 +2651,46 @@ class PoolerConfig: ## for embeddings models normalize: Optional[bool] = None """ - Whether to normalize the embeddings outputs. + Whether to normalize the embeddings outputs. Defaults to True. """ dimensions: Optional[int] = None """ Reduce the dimensions of embeddings if model - support matryoshka representation. + support matryoshka representation. Defaults to None. + """ + enable_chunked_processing: Optional[bool] = None + """ + Whether to enable chunked processing for long inputs that exceed the model's + maximum position embeddings. When enabled, long inputs will be split into + chunks, processed separately, and then aggregated using weighted averaging. + This allows embedding models to handle arbitrarily long text without CUDA + errors. Defaults to False. + """ + max_embed_len: Optional[int] = None + """ + Maximum input length allowed for embedding generation. When set, allows + inputs longer than max_embed_len to be accepted for embedding models. + When an input exceeds max_embed_len, it will be handled according to + the original max_model_len validation logic. + Defaults to None (i.e. set to max_model_len). 
""" ## for classification models activation: Optional[bool] = None """ Whether to apply activation function to the classification outputs. + Defaults to True. + """ + logit_bias: Optional[float] = None + """ + If provided, apply classification logit biases. Defaults to None. """ ## for reward models softmax: Optional[bool] = None """ Whether to apply softmax to the reward outputs. + Defaults to True. """ step_tag_id: Optional[int] = None """ @@ -2683,25 +2705,6 @@ class PoolerConfig: ``math-shepherd-mistral-7b-prm`` model. """ - enable_chunked_processing: Optional[bool] = None - """ - Whether to enable chunked processing for long inputs that exceed the model's - maximum position embeddings. When enabled, long inputs will be split into - chunks, processed separately, and then aggregated using weighted averaging. - This allows embedding models to handle arbitrarily long text without CUDA - errors. Defaults to False. - """ - - max_embed_len: Optional[int] = None - """ - Maximum input length allowed for embedding generation. When set, allows - inputs longer than max_embed_len to be accepted for embedding models. - This parameter enables accepting long inputs without requiring - VLLM_ALLOW_LONG_MAX_MODEL_LEN environment variable. When an input exceeds - max_embed_len, it will be handled according to the original max_model_len - validation logic. Defaults to None (i.e. set to max_model_len). 
- """ - def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 62b3ee1abaca8..afe7ea7b83924 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -633,9 +633,14 @@ class ClassifierPooler(Pooler): ) -> None: super().__init__() + from vllm.config import get_current_vllm_config + vllm_config = get_current_vllm_config() + self.pooling = pooling self.classifier = classifier self.act_fn = act_fn or PoolerClassify() + self.logit_bias: Optional[ + float] = vllm_config.model_config.pooler_config.logit_bias def get_supported_tasks(self) -> Set[PoolingTask]: return {"classify", "score"} @@ -654,6 +659,9 @@ class ClassifierPooler(Pooler): pooled_data = self.classifier(pooled_data) # pooled_data shape: [batchsize, num_labels] + if self.logit_bias is not None: + pooled_data -= self.logit_bias + pooling_params = get_pooling_params(pooling_metadata) flags = [p.activation for p in pooling_params] diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 377b7bf26a07a..0245e89f7da71 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -210,8 +210,10 @@ class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_config(vllm_config: "VllmConfig") -> None: config = vllm_config.model_config.hf_config - config.num_labels = 1 + pooler_config = vllm_config.model_config.pooler_config + if pooler_config.logit_bias is None: + pooler_config.logit_bias = 2.65 class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig): diff --git a/vllm/model_executor/models/jina_vl.py b/vllm/model_executor/models/jina_vl.py index 8c64f636c6a0f..140b0d1674728 100644 --- a/vllm/model_executor/models/jina_vl.py +++ b/vllm/model_executor/models/jina_vl.py @@ -92,17 +92,14 @@ class 
JinaVLForSequenceClassification(Qwen2VLForConditionalGeneration, pooler_config = vllm_config.model_config.pooler_config assert pooler_config is not None - # logit bias for sigmoid normalization - self.LOGIT_BIAS = 2.65 - self.score = JinaVLScorer(config) self.pooler = DispatchPooler({ "encode": Pooler.for_encode(pooler_config), "classify": - Pooler.for_classify(pooler_config, classifier=None), + Pooler.for_classify(pooler_config, classifier=self.score), "score": - Pooler.for_classify(pooler_config, classifier=None), + Pooler.for_classify(pooler_config, classifier=self.score), }) @classmethod @@ -137,9 +134,7 @@ class JinaVLForSequenceClassification(Qwen2VLForConditionalGeneration, inputs_embeds=inputs_embeds, **kwargs, ) - - logits = self.score(hidden_states) - self.LOGIT_BIAS - return logits + return hidden_states def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) From e66ed3e675d9e3c8cbc128adf81b62162827daee Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 2 Sep 2025 13:18:15 -0400 Subject: [PATCH 27/95] [CI Failure] Skip failing nvfp4 silu test (#23959) Signed-off-by: mgoin Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py b/tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py index 969f14cc3fe62..4325162ae94a9 100644 --- a/tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py +++ b/tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py @@ -8,7 +8,8 @@ from vllm.model_executor.layers.activation import SiluAndMul from vllm.platforms import current_platform from vllm.scalar_type import scalar_types -if not current_platform.has_device_capability(100): +if not (current_platform.has_device_capability(100) + and hasattr(torch.ops._C, "silu_and_mul_nvfp4_quant")): 
pytest.skip(reason="Nvfp4 Requires compute capability of 10 or above.", allow_module_level=True) From 0e1759cd54f98e628d6660becb7f5626d084d68f Mon Sep 17 00:00:00 2001 From: Peter Pan Date: Wed, 3 Sep 2025 01:27:20 +0800 Subject: [PATCH 28/95] [docs] add SYS_NICE cap & `security-opt` for docker/k8s (#24017) Signed-off-by: Peter Pan Signed-off-by: Peter Pan Co-authored-by: Li, Jiang Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/getting_started/installation/cpu.md | 32 +++++++++++++++++++ .../installation/cpu/arm.inc.md | 4 +++ .../installation/cpu/s390x.inc.md | 3 ++ .../installation/cpu/x86.inc.md | 1 + 4 files changed, 40 insertions(+) diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index 7f0ecb2bc0b74..ccb2909ea3fb6 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -194,3 +194,35 @@ vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel - Both of them require `amx` CPU flag. - `VLLM_CPU_MOE_PREPACK` can provides better performance for MoE models - `VLLM_CPU_SGL_KERNEL` can provides better performance for MoE models and small-batch scenarios. + +### Why do I see `get_mempolicy: Operation not permitted` when running in Docker? + +In some container environments (like Docker), NUMA-related syscalls used by vLLM (e.g., `get_mempolicy`, `migrate_pages`) are blocked/denied in the runtime's default seccomp/capabilities settings. This may lead to warnings like `get_mempolicy: Operation not permitted`. Functionality is not affected, but NUMA memory binding/migration optimizations may not take effect and performance can be suboptimal. + +To enable these optimizations inside Docker with the least privilege, you can follow below tips: + +```bash +docker run ... --cap-add SYS_NICE --security-opt seccomp=unconfined ... + +# 1) `--cap-add SYS_NICE` is to address `get_mempolicy` EPERM issue. 
+ +# 2) `--security-opt seccomp=unconfined` is to enable `migrate_pages` for `numa_migrate_pages()`. +# Actually, `seccomp=unconfined` bypasses the seccomp for container, +# if it's unacceptable, you can customize your own seccomp profile, +# based on docker/runtime default.json and add `migrate_pages` to `SCMP_ACT_ALLOW` list. + +# reference : https://docs.docker.com/engine/security/seccomp/ +``` + +Alternatively, running with `--privileged=true` also works but is broader and not generally recommended. + +In K8S, the following configuration can be added to workload yaml to achieve the same effect as above: + +```yaml +securityContext: + seccompProfile: + type: Unconfined + capabilities: + add: + - SYS_NICE +``` diff --git a/docs/getting_started/installation/cpu/arm.inc.md b/docs/getting_started/installation/cpu/arm.inc.md index cac578eefb1d7..e45baa0aa4938 100644 --- a/docs/getting_started/installation/cpu/arm.inc.md +++ b/docs/getting_started/installation/cpu/arm.inc.md @@ -48,6 +48,10 @@ docker run --rm \ --dtype=bfloat16 \ other vLLM OpenAI server arguments ``` + +!!! tip + An alternative of `--privileged=true` is `--cap-add SYS_NICE --security-opt seccomp=unconfined`. + # --8<-- [end:build-image-from-source] # --8<-- [start:extra-information] # --8<-- [end:extra-information] diff --git a/docs/getting_started/installation/cpu/s390x.inc.md b/docs/getting_started/installation/cpu/s390x.inc.md index c1917267ce91b..f9c4ccb942fac 100644 --- a/docs/getting_started/installation/cpu/s390x.inc.md +++ b/docs/getting_started/installation/cpu/s390x.inc.md @@ -89,6 +89,9 @@ docker run --rm \ other vLLM OpenAI server arguments ``` +!!! tip + An alternative of `--privileged true` is `--cap-add SYS_NICE --security-opt seccomp=unconfined`. 
+ # --8<-- [end:build-image-from-source] # --8<-- [start:extra-information] # --8<-- [end:extra-information] diff --git a/docs/getting_started/installation/cpu/x86.inc.md b/docs/getting_started/installation/cpu/x86.inc.md index f7af259ace628..836da33f65317 100644 --- a/docs/getting_started/installation/cpu/x86.inc.md +++ b/docs/getting_started/installation/cpu/x86.inc.md @@ -44,6 +44,7 @@ docker build -f docker/Dockerfile.cpu \ # Launching OpenAI server docker run --rm \ --security-opt seccomp=unconfined \ + --cap-add SYS_NICE \ --shm-size=4g \ -p 8000:8000 \ -e VLLM_CPU_KVCACHE_SPACE= \ From c83c4ff815f57f57194b99828368f5785ca4e1cc Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu Date: Wed, 3 Sep 2025 01:49:16 +0800 Subject: [PATCH 29/95] [Benchmark] Add support for local hf dataset path in benchmark (#23999) Signed-off-by: zjy0516 --- benchmarks/README.md | 7 +++- vllm/benchmarks/datasets.py | 64 +++++++++++++++++++++++++++++-------- 2 files changed, 56 insertions(+), 15 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 38072152b653b..98b3600d13635 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -110,7 +110,12 @@ become available. 🚧: to be supported -**Note**: HuggingFace dataset's `dataset-name` should be set to `hf` +**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`. 
+For local `dataset-path`, please set `hf-name` to its Hugging Face ID like + +```bash +--dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat +``` ## 🚀 Example - Online Benchmark diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 93519b5ba1523..882b68ac9e2fd 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -1227,6 +1227,16 @@ def add_dataset_parser(parser: FlexibleArgumentParser): type=str, default=None, help="Split of the HF dataset.") + hf_group.add_argument( + "--hf-name", + type=str, + default=None, + help=( + "Name of the dataset on HuggingFace " + "(e.g., 'lmarena-ai/VisionArena-Chat'). " + "Specify this if your dataset-path is a local path." + ), + ) hf_group.add_argument( "--hf-output-len", type=int, @@ -1307,28 +1317,53 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: elif args.dataset_name == "hf": # all following datasets are implemented from the # HuggingFaceDataset base class - if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: + if ( + args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS + or args.hf_name in VisionArenaDataset.SUPPORTED_DATASET_PATHS + ): dataset_class = VisionArenaDataset args.hf_split = "train" args.hf_subset = None - elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: + elif ( + args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS + or args.hf_name in InstructCoderDataset.SUPPORTED_DATASET_PATHS + ): dataset_class = InstructCoderDataset args.hf_split = "train" - elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS: + elif ( + args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS + or args.hf_name in MTBenchDataset.SUPPORTED_DATASET_PATHS + ): dataset_class = MTBenchDataset args.hf_split = "train" - elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: + elif ( + args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS + or args.hf_name in 
ConversationDataset.SUPPORTED_DATASET_PATHS + ): dataset_class = ConversationDataset - elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: + elif ( + args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS + or args.hf_name in AIMODataset.SUPPORTED_DATASET_PATHS + ): dataset_class = AIMODataset args.hf_split = "train" - elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS: # noqa: E501 + elif ( + args.dataset_path + in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS # noqa: E501 + or args.hf_name in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS + ): dataset_class = NextEditPredictionDataset args.hf_split = "train" - elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS: + elif ( + args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS + or args.hf_name in ASRDataset.SUPPORTED_DATASET_PATHS + ): dataset_class = ASRDataset args.hf_split = "train" - elif args.dataset_path in MLPerfDataset.SUPPORTED_DATASET_PATHS: + elif ( + args.dataset_path in MLPerfDataset.SUPPORTED_DATASET_PATHS + or args.hf_name in MLPerfDataset.SUPPORTED_DATASET_PATHS + ): dataset_class = MLPerfDataset args.hf_split = "train" else: @@ -1358,6 +1393,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]: dataset_split=args.hf_split, random_seed=args.seed, no_stream=args.no_stream, + hf_name=args.hf_name, ).sample( num_requests=args.num_prompts, tokenizer=tokenizer, @@ -1710,6 +1746,7 @@ class HuggingFaceDataset(BenchmarkDataset): dataset_split: str, no_stream: bool = False, dataset_subset: Optional[str] = None, + hf_name: Optional[str] = None, **kwargs, ) -> None: super().__init__(dataset_path=dataset_path, **kwargs) @@ -1717,6 +1754,7 @@ class HuggingFaceDataset(BenchmarkDataset): self.dataset_split = dataset_split self.dataset_subset = dataset_subset self.load_stream = not no_stream + self.hf_name = hf_name or dataset_path self.load_data() def load_data(self) -> None: @@ -1827,10 +1865,9 @@ class VisionArenaDataset(HuggingFaceDataset): for 
i, item in enumerate(self.data): if len(sampled_requests) >= num_requests: break - parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path) + parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.hf_name) if parser_fn is None: - raise ValueError( - f"Unsupported dataset path: {self.dataset_path}") + raise ValueError(f"Unsupported dataset path: {self.hf_name}") prompt = parser_fn(item) mm_content = process_image(item["images"][0]) prompt_len = len(tokenizer(prompt).input_ids) @@ -2099,10 +2136,9 @@ class NextEditPredictionDataset(HuggingFaceDataset): def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int, request_id_prefix: str = "", **kwargs): - formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get( - self.dataset_path) + formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.hf_name) if formatting_prompt_func is None: - raise ValueError(f"Unsupported dataset path: {self.dataset_path}") + raise ValueError(f"Unsupported dataset path: {self.hf_name}") samples = [] for i, sample in enumerate(self.data): sample = formatting_prompt_func(sample) From 1c4131058465db9966ffc3a701a86a479216d0ea Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Tue, 2 Sep 2025 13:54:10 -0400 Subject: [PATCH 30/95] [Bugfix] Fix transform_config parsing in Compressed Tensors (#23945) Signed-off-by: Kyle Sayers --- .../quantization/compressed_tensors/compressed_tensors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index b07bf675ca47d..97041a5a050f1 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -63,7 +63,7 @@ class CompressedTensorsConfig(QuantizationConfig): sparsity_ignore_list: list[str], kv_cache_scheme: Optional[dict[str, Any]] = None, config: 
Optional[dict[str, Any]] = None, - transform_config: Optional[TransformConfig] = None, + transform_config: Optional[dict[str, Any]] = None, ): super().__init__() self.ignore = ignore @@ -75,7 +75,7 @@ class CompressedTensorsConfig(QuantizationConfig): self.sparsity_ignore_list = sparsity_ignore_list self.config = config - if transform_config is not None: + if transform_config: self.transform_config = TransformConfig.model_validate( transform_config) else: From f399182e8c401303f7399b953417b926e4eb64f2 Mon Sep 17 00:00:00 2001 From: Chenheli Hua Date: Tue, 2 Sep 2025 10:55:32 -0700 Subject: [PATCH 31/95] Run ruff format on a few files. (#24075) Signed-off-by: Chenheli Hua --- tests/entrypoints/test_chat_utils.py | 1032 ++++++++++++--------- vllm/entrypoints/chat_utils.py | 378 +++++--- vllm/entrypoints/openai/serving_engine.py | 229 +++-- 3 files changed, 948 insertions(+), 691 deletions(-) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 647f1c7b7f34f..0c1f19371a160 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -46,23 +46,27 @@ MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" @pytest.fixture(scope="function") def phi3v_model_config(): - return ModelConfig(PHI3V_MODEL_ID, - runner="generate", - trust_remote_code=True, - limit_mm_per_prompt={ - "image": 2, - }) + return ModelConfig( + PHI3V_MODEL_ID, + runner="generate", + trust_remote_code=True, + limit_mm_per_prompt={ + "image": 2, + }, + ) @pytest.fixture(scope="function") def phi3v_model_config_mm_interleaved(): - return ModelConfig(PHI3V_MODEL_ID, - runner="generate", - trust_remote_code=True, - interleave_mm_strings=True, - limit_mm_per_prompt={ - "image": 2, - }) + return ModelConfig( + PHI3V_MODEL_ID, + runner="generate", + trust_remote_code=True, + interleave_mm_strings=True, + limit_mm_per_prompt={ + "image": 2, + }, + ) @pytest.fixture(scope="module") @@ -77,14 +81,16 @@ def phi3v_tokenizer(): 
@pytest.fixture(scope="function") def qwen25omni_model_config_mm_interleaved(): - return ModelConfig(QWEN25OMNI_MODEL_ID, - runner="generate", - interleave_mm_strings=True, - limit_mm_per_prompt={ - "image": 2, - "audio": 1, - "video": 1, - }) + return ModelConfig( + QWEN25OMNI_MODEL_ID, + runner="generate", + interleave_mm_strings=True, + limit_mm_per_prompt={ + "image": 2, + "audio": 1, + "video": 1, + }, + ) @pytest.fixture(scope="module") @@ -99,11 +105,13 @@ def qwen25omni_tokenizer(): @pytest.fixture(scope="module") def mllama_model_config(): - return ModelConfig(MLLAMA_MODEL_ID, - runner="generate", - limit_mm_per_prompt={ - "image": 2, - }) + return ModelConfig( + MLLAMA_MODEL_ID, + runner="generate", + limit_mm_per_prompt={ + "image": 2, + }, + ) @pytest.fixture(scope="module") @@ -118,11 +126,13 @@ def mllama_tokenizer(): @pytest.fixture(scope="function") def mistral_model_config(): - return ModelConfig(MISTRAL_MODEL_ID, - runner="generate", - limit_mm_per_prompt={ - "image": 2, - }) + return ModelConfig( + MISTRAL_MODEL_ID, + runner="generate", + limit_mm_per_prompt={ + "image": 2, + }, + ) @pytest.fixture(scope="module") @@ -137,21 +147,21 @@ def mistral_tokenizer(): @pytest.fixture(scope="module") def image_url(): - image = ImageAsset('cherry_blossom') + image = ImageAsset("cherry_blossom") base64 = encode_image_base64(image.pil_image) return f"data:image/jpeg;base64,{base64}" @pytest.fixture(scope="module") def video_url(): - video = VideoAsset('baby_reading', 1) + video = VideoAsset("baby_reading", 1) base64 = encode_video_base64(video.np_ndarrays) return f"data:video/jpeg;base64,{base64}" @pytest.fixture(scope="module") def audio_url(): - audio = AudioAsset('mary_had_lamb') + audio = AudioAsset("mary_had_lamb") base64 = encode_audio_base64(*audio.audio_and_sample_rate) return f"data:audio/ogg;base64,{base64}" @@ -195,15 +205,18 @@ def test_parse_chat_messages_single_image( [{ "role": "user", - "content": [{ - "type": "image_url", - "image_url": { - 
"url": image_url - } - }, { - "type": "text", - "text": "What's in the image?" - }] + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], }], phi3v_model_config, phi3v_tokenizer, @@ -223,58 +236,69 @@ def test_parse_chat_messages_empty_system( ): # Test string format conversation, _ = parse_chat_messages( - [{ - "role": "system", - "content": "" - }, { - "role": "user", - "content": [{ - "type": "text", - "text": "Who are you?" - }] - }], + [ + { + "role": "system", + "content": "" + }, + { + "role": "user", + "content": [{ + "type": "text", + "text": "Who are you?" + }], + }, + ], mistral_model_config, mistral_tokenizer, content_format="string", ) - assert conversation == [{ - "role": "system", - "content": "" - }, { - "role": "user", - "content": "Who are you?" - }] + assert conversation == [ + { + "role": "system", + "content": "" + }, + { + "role": "user", + "content": "Who are you?" + }, + ] # Test openai format conversation, _ = parse_chat_messages( - [{ + [ + { + "role": "system", + "content": "" + }, + { + "role": "user", + "content": [{ + "type": "text", + "text": "Who are you?" + }], + }, + ], + mistral_model_config, + mistral_tokenizer, + content_format="openai", + ) + assert conversation == [ + { "role": "system", - "content": "" - }, { + "content": [{ + "type": "text", + "text": "" + }] + }, + { "role": "user", "content": [{ "type": "text", "text": "Who are you?" }] - }], - mistral_model_config, - mistral_tokenizer, - content_format="openai", - ) - assert conversation == [{ - "role": "system", - "content": [{ - "type": "text", - "text": "" - }] - }, { - "role": - "user", - "content": [{ - "type": "text", - "text": "Who are you?" 
- }] - }] + }, + ] @pytest.mark.asyncio @@ -287,15 +311,18 @@ async def test_parse_chat_messages_single_image_async( [{ "role": "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in the image?" - }] + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], }], phi3v_model_config, phi3v_tokenizer, @@ -318,18 +345,22 @@ def test_parse_chat_messages_multiple_images( [{ "role": "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_pil", - "image_pil": ImageAsset('cherry_blossom').pil_image - }, { - "type": "text", - "text": "What's in these images?" - }] + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "image_pil", + "image_pil": ImageAsset("cherry_blossom").pil_image, + }, + { + "type": "text", + "text": "What's in these images?" + }, + ], }], phi3v_model_config, phi3v_tokenizer, @@ -340,7 +371,7 @@ def test_parse_chat_messages_multiple_images( "role": "user", "content": - "<|image_1|>\n<|image_2|>\nWhat's in these images?" + "<|image_1|>\n<|image_2|>\nWhat's in these images?", }] _assert_mm_data_is_image_input(mm_data, 2) @@ -355,18 +386,22 @@ async def test_parse_chat_messages_multiple_images_async( [{ "role": "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_pil", - "image_pil": ImageAsset('cherry_blossom').pil_image - }, { - "type": "text", - "text": "What's in these images?" - }] + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "image_pil", + "image_pil": ImageAsset("cherry_blossom").pil_image, + }, + { + "type": "text", + "text": "What's in these images?" 
+ }, + ], }], phi3v_model_config, phi3v_tokenizer, @@ -377,7 +412,7 @@ async def test_parse_chat_messages_multiple_images_async( "role": "user", "content": - "<|image_1|>\n<|image_2|>\nWhat's in these images?" + "<|image_1|>\n<|image_2|>\nWhat's in these images?", }] _assert_mm_data_is_image_input(await mm_future, 2) @@ -391,22 +426,26 @@ def test_parse_chat_messages_placeholder_already_in_prompt( [{ "role": "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": - "text", - "text": - "What's in <|image_1|> and how does it compare to <|image_2|>?" - }] + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": + "text", + "text": + "What's in <|image_1|> and how does it compare to <|image_2|>?", # noqa: E501 + }, + ], }], phi3v_model_config, phi3v_tokenizer, @@ -416,7 +455,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt( "role": "user", "content": - "What's in <|image_1|> and how does it compare to <|image_2|>?" + "What's in <|image_1|> and how does it compare to <|image_2|>?", }] _assert_mm_data_is_image_input(mm_data, 2) @@ -447,9 +486,9 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt( "type": "text", "text": - "What's in <|image_1|> and how does it compare to the other one?" # noqa: E501 - } - ] + "What's in <|image_1|> and how does it compare to the other one?", # noqa: E501 + }, + ], }], phi3v_model_config, phi3v_tokenizer, @@ -461,7 +500,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt( "user", "content": "<|image_2|>\nWhat's in <|image_1|> and how does it compare to the " - "other one?" 
+ "other one?", }] _assert_mm_data_is_image_input(mm_data, 2) @@ -472,34 +511,44 @@ def test_parse_chat_messages_multiple_images_across_messages( image_url, ): conversation, mm_data = parse_chat_messages( - [{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in this image?" - }] - }, { - "role": "assistant", - "content": "Some stuff." - }, { - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What about this one?" - }] - }], + [ + { + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "What about this one?" + }, + ], + }, + ], phi3v_model_config, phi3v_tokenizer, content_format="string", @@ -527,19 +576,23 @@ def test_parse_chat_messages_context_text_format( phi3v_tokenizer, ): conversation, mm_data = parse_chat_messages( - [{ - "role": "user", - "content": [{ - "type": "text", - "text": "What's in this text?" - }] - }, { - "role": "assistant", - "content": "Some stuff." - }, { - "role": "user", - "content": "What about this one?" - }], + [ + { + "role": "user", + "content": [{ + "type": "text", + "text": "What's in this text?" + }], + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": "user", + "content": "What about this one?" + }, + ], phi3v_model_config, phi3v_tokenizer, content_format="openai", @@ -551,21 +604,21 @@ def test_parse_chat_messages_context_text_format( "content": [{ "type": "text", "text": "What's in this text?" - }] + }], }, { "role": "assistant", "content": [{ "type": "text", "text": "Some stuff." 
- }] + }], }, { "role": "user", "content": [{ "type": "text", "text": "What about this one?" - }] + }], }, ] @@ -578,31 +631,37 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message( with warnings.catch_warnings(): warnings.filterwarnings( "ignore", - message="coroutine 'async_get_and_parse_image' was never awaited") + message="coroutine 'async_get_and_parse_image' was never awaited", + ) with pytest.raises(ValueError, match="At most"): parse_chat_messages( [{ "role": "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in these images?" - }] + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + }, + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + }, + { + "type": "text", + "text": "What's in these images?" + }, + ], }], phi3v_model_config, phi3v_tokenizer, @@ -618,42 +677,54 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages( with warnings.catch_warnings(): warnings.filterwarnings( "ignore", - message="coroutine 'async_get_and_parse_image' was never awaited") + message="coroutine 'async_get_and_parse_image' was never awaited", + ) with pytest.raises(ValueError, match="At most"): parse_chat_messages( - [{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in this image?" - }] - }, { - "role": "assistant", - "content": "Some stuff." - }, { - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What about these two?" 
- }] - }], + [ + { + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + }, + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + }, + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + }, + { + "type": "text", + "text": "What about these two?" + }, + ], + }, + ], phi3v_model_config, phi3v_tokenizer, content_format="string", @@ -670,12 +741,14 @@ def test_parse_chat_messages_multiple_images_uncommon_input( "role": "user", "content": [ - "What's in these images?", { + "What's in these images?", + { "image_url": image_url - }, { + }, + { "image_url": image_url - } - ] + }, + ], }], phi3v_model_config, phi3v_tokenizer, @@ -686,7 +759,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input( "role": "user", "content": - "<|image_1|>\n<|image_2|>\nWhat's in these images?" + "<|image_1|>\n<|image_2|>\nWhat's in these images?", }] _assert_mm_data_is_image_input(mm_data, 2) @@ -700,26 +773,32 @@ def test_parse_chat_messages_multiple_images_interleave( [{ "role": "user", - "content": [{ - "type": "text", - "text": "I need you to compare this image" - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "and this one" - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "Do they have differences?" - }] + "content": [ + { + "type": "text", + "text": "I need you to compare this image", + }, + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "and this one" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "Do they have differences?" 
+ }, + ], }], phi3v_model_config_mm_interleaved, phi3v_tokenizer, @@ -731,7 +810,7 @@ def test_parse_chat_messages_multiple_images_interleave( "user", "content": "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n" # noqa: E501 - "Do they have differences?" + "Do they have differences?", }] _assert_mm_data_is_image_input(mm_data, 2) @@ -746,26 +825,32 @@ async def test_parse_chat_messages_multiple_images_interleave_async( [{ "role": "user", - "content": [{ - "type": "text", - "text": "I need you to compare this image" - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "and this one" - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "Do they have differences?" - }] + "content": [ + { + "type": "text", + "text": "I need you to compare this image", + }, + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "and this one" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "Do they have differences?" + }, + ], }], phi3v_model_config_mm_interleaved, phi3v_tokenizer, @@ -777,7 +862,7 @@ async def test_parse_chat_messages_multiple_images_interleave_async( "user", "content": "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n" # noqa: E501 - "Do they have differences?" + "Do they have differences?", }] _assert_mm_data_is_image_input(await mm_data, 2) @@ -788,135 +873,161 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave( image_url, ): conversation, mm_data = parse_chat_messages( - [{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's on this image?" - }, - { - "type": "image_url", - "image_url": { - "url": image_url - } - }, - { - "type": "text", - "text": "Be accurate." - }, - ] - }, { - "role": "assistant", - "content": "Some stuff." 
- }, { - "role": - "user", - "content": [{ - "type": "text", - "text": "What's on this image?" - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }] - }], + [ + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "Be accurate." + }, + ], + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + ], + }, + ], phi3v_model_config_mm_interleaved, phi3v_tokenizer, content_format="string", ) - assert conversation == [{ - "role": - "user", - "content": - "What's on this image?\n<|image_1|>\nBe accurate." - }, { - "role": "assistant", - "content": "Some stuff." - }, { - "role": "user", - "content": "What's on this image?\n<|image_2|>" - }] + assert conversation == [ + { + "role": "user", + "content": "What's on this image?\n<|image_1|>\nBe accurate.", + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": "user", + "content": "What's on this image?\n<|image_2|>" + }, + ] _assert_mm_data_is_image_input(mm_data, 2) def test_parse_chat_messages_multiple_modals_multiple_messages_interleave( - qwen25omni_model_config_mm_interleaved, qwen25omni_tokenizer, - image_url, video_url, audio_url): + qwen25omni_model_config_mm_interleaved, + qwen25omni_tokenizer, + image_url, + video_url, + audio_url, +): conversation, mm_data = parse_chat_messages( - [{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's on this image?" - }, - { - "type": "image_url", - "image_url": { - "url": image_url - } - }, - { - "type": "text", - "text": "Now listen to this audio" - }, - { - "type": "audio_url", - "audio_url": { - "url": audio_url - } - }, - ] - }, { - "role": "assistant", - "content": "Some stuff." 
- }, { - "role": - "user", - "content": [{ - "type": "text", - "text": "What's on this image?" - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "And what's in the video?" - }, { - "type": "video_url", - "video_url": { - "url": video_url - } - }] - }], + [ + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "Now listen to this audio" + }, + { + "type": "audio_url", + "audio_url": { + "url": audio_url + } + }, + ], + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "And what's in the video?" + }, + { + "type": "video_url", + "video_url": { + "url": video_url + } + }, + ], + }, + ], qwen25omni_model_config_mm_interleaved, qwen25omni_tokenizer, content_format="string", ) - assert conversation == [{ - "role": - "user", - "content": - "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" - "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>" - }, { - "role": "assistant", - "content": "Some stuff." - }, { - "role": - "user", - "content": - "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" - "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>" - }] + assert conversation == [ + { + "role": + "user", + "content": + "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501 + }, + { + "role": "assistant", + "content": "Some stuff." 
+ }, + { + "role": + "user", + "content": + "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", + }, + ] _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1}) @@ -929,7 +1040,8 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders( with pytest.raises( ValueError, match=r"Found more '<|image_1|>' placeholders in input prompt " - "than actual multimodal data items."): + "than actual multimodal data items.", + ): parse_chat_messages( [{ "role": @@ -952,9 +1064,9 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders( "text", "text": "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n" # noqa: E501 - "Do they have differences?" + "Do they have differences?", }, - ] + ], }], phi3v_model_config_mm_interleaved, phi3v_tokenizer, @@ -973,12 +1085,15 @@ def test_mllama_single_image( [{ "role": "user", - "content": [{ - 'type': 'text', - 'text': 'The content of this image is:' - }, { - "image_url": image_url - }] + "content": [ + { + "type": "text", + "text": "The content of this image is:" + }, + { + "image_url": image_url + }, + ], }], mllama_model_config, mllama_tokenizer, @@ -986,14 +1101,17 @@ def test_mllama_single_image( ) _assert_mm_data_is_image_input(mm_data, 1) assert conversation == [{ - 'role': - 'user', - 'content': [{ - 'type': 'text', - 'text': 'The content of this image is:' - }, { - 'type': 'image' - }] + "role": + "user", + "content": [ + { + "type": "text", + "text": "The content of this image is:" + }, + { + "type": "image" + }, + ], }] @@ -1009,20 +1127,20 @@ def test_mllama_interleaved_images( "user", "content": [ { - 'type': 'text', - 'text': 'The content of the first image is:' + "type": "text", + "text": "The content of the first image is:", }, { "image_url": image_url }, { - 'type': 'text', - 'text': 'The content of the second image is:' + "type": "text", + "text": "The content of 
the second image is:", }, { "image_url": image_url }, - ] + ], }], mllama_model_config, mllama_tokenizer, @@ -1030,19 +1148,24 @@ def test_mllama_interleaved_images( ) _assert_mm_data_is_image_input(mm_data, 2) assert conversation == [{ - 'role': - 'user', - 'content': [{ - 'type': 'text', - 'text': 'The content of the first image is:' - }, { - 'type': 'image' - }, { - 'type': 'text', - 'text': 'The content of the second image is:' - }, { - 'type': 'image' - }] + "role": + "user", + "content": [ + { + "type": "text", + "text": "The content of the first image is:" + }, + { + "type": "image" + }, + { + "type": "text", + "text": "The content of the second image is:" + }, + { + "type": "image" + }, + ], }] @@ -1053,34 +1176,36 @@ def test_multimodal_image_parsing_matches_hf(model, image_url): def get_conversation(is_hf: bool): img_part = {"type": "image_url", "image_url": {"url": image_url}} if is_hf: - img_part = {'type': 'image'} + img_part = {"type": "image"} return [{ - 'role': - 'user', - 'content': [ + "role": + "user", + "content": [ { - 'type': 'text', - 'text': 'The content of the first image is:' + "type": "text", + "text": "The content of the first image is:", }, img_part, { - 'type': 'text', - 'text': 'The content of the second image is:' + "type": "text", + "text": "The content of the second image is:", }, img_part, { - 'type': 'text', - 'text': 'What animal is in the first image?' 
+ "type": "text", + "text": "What animal is in the first image?", }, - ] + ], }] # Build a config for the model - model_config = ModelConfig(model, - runner="generate", - limit_mm_per_prompt={ - "image": 2, - }) + model_config = ModelConfig( + model, + runner="generate", + limit_mm_per_prompt={ + "image": 2, + }, + ) # Build the tokenizer group and grab the underlying tokenizer tokenizer_group = TokenizerGroup( @@ -1126,7 +1251,8 @@ def test_multimodal_image_parsing_matches_hf(model, image_url): [ QWEN2VL_MODEL_ID, # tokenizer.chat_template is of type str HERMES_MODEL_ID, # tokenizer.chat_template is of type dict - ]) + ], +) @pytest.mark.parametrize("use_tools", [True, False]) def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): """checks that chat_template is a dict type for HF models.""" @@ -1152,14 +1278,14 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): ) tokenizer = tokenizer_group.tokenizer - tools = [{ + tools = ([{ "type": "function", "function": { "name": "dummy_function_name", "description": "This is a dummy function", - "parameters": sample_json_schema - } - }] if use_tools else None + "parameters": sample_json_schema, + }, + }] if use_tools else None) # Test detecting the tokenizer's chat_template chat_template = resolve_hf_chat_template( diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 1954cbcbf1edd..80e2c44a02513 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -103,6 +103,7 @@ class PILImage(BaseModel): """ A PIL.Image.Image object. 
""" + image_pil: Image.Image model_config = ConfigDict(arbitrary_types_allowed=True) @@ -115,6 +116,7 @@ class CustomChatCompletionContentPILImageParam(TypedDict, total=False): "image_pil": ImageAsset('cherry_blossom').pil_image } """ + image_pil: Required[PILImage] @@ -127,6 +129,7 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): "image_url": "https://example.com/image.jpg" } """ + image_url: Required[str] @@ -138,6 +141,7 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): "audio_url": "https://example.com/audio.mp3" } """ + audio_url: Required[str] @@ -149,6 +153,7 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False): "video_url": "https://example.com/video.mp4" } """ + video_url: Required[str] @@ -174,19 +179,24 @@ class CustomThinkCompletionContentParam(TypedDict, total=False): ChatCompletionContentPartParam: TypeAlias = Union[ - OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam, + OpenAIChatCompletionContentPartParam, + ChatCompletionContentPartAudioParam, ChatCompletionContentPartInputAudioParam, - ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam, + ChatCompletionContentPartVideoParam, + ChatCompletionContentPartRefusalParam, CustomChatCompletionContentPILImageParam, CustomChatCompletionContentSimpleImageParam, ChatCompletionContentPartImageEmbedsParam, CustomChatCompletionContentSimpleAudioParam, - CustomChatCompletionContentSimpleVideoParam, str, - CustomThinkCompletionContentParam] + CustomChatCompletionContentSimpleVideoParam, + str, + CustomThinkCompletionContentParam, +] class CustomChatCompletionMessageParam(TypedDict, total=False): """Enables custom roles in the Chat Completion API.""" + role: Required[str] """The role of the message's author.""" @@ -207,9 +217,11 @@ class CustomChatCompletionMessageParam(TypedDict, total=False): """The tool calls generated by the model, such as function calls.""" 
-ChatCompletionMessageParam = Union[OpenAIChatCompletionMessageParam, - CustomChatCompletionMessageParam, - OpenAIHarmonyMessage] +ChatCompletionMessageParam = Union[ + OpenAIChatCompletionMessageParam, + CustomChatCompletionMessageParam, + OpenAIHarmonyMessage, +] # TODO: Make fields ReadOnly once mypy supports it @@ -262,13 +274,13 @@ def _is_var_or_elems_access( key: Optional[str] = None, ) -> bool: if isinstance(node, jinja2.nodes.Filter): - return (node.node is not None - and _is_var_or_elems_access(node.node, varname, key)) + return node.node is not None and _is_var_or_elems_access( + node.node, varname, key) if isinstance(node, jinja2.nodes.Test): return _is_var_or_elems_access(node.node, varname, key) - if (isinstance(node, jinja2.nodes.Getitem) - and isinstance(node.arg, jinja2.nodes.Slice)): + if isinstance(node, jinja2.nodes.Getitem) and isinstance( + node.arg, jinja2.nodes.Slice): return _is_var_or_elems_access(node.node, varname, key) # yapf: disable @@ -373,15 +385,18 @@ def resolve_mistral_chat_template( ) -> Optional[str]: if chat_template is not None: logger.warning_once( - "'chat_template' cannot be overridden for mistral tokenizer.") + "'chat_template' cannot be overridden for mistral tokenizer." + ) if "add_generation_prompt" in kwargs: logger.warning_once( "'add_generation_prompt' is not supported for mistral tokenizer, " - "so it will be ignored.") + "so it will be ignored." + ) if "continue_final_message" in kwargs: logger.warning_once( "'continue_final_message' is not supported for mistral tokenizer, " - "so it will be ignored.") + "so it will be ignored." 
+ ) return None @@ -401,23 +416,35 @@ def resolve_hf_chat_template( try: processor = cached_get_processor( tokenizer.name_or_path, - processor_cls=(PreTrainedTokenizer, PreTrainedTokenizerFast, - ProcessorMixin), + processor_cls=( + PreTrainedTokenizer, + PreTrainedTokenizerFast, + ProcessorMixin, + ), trust_remote_code=model_config.trust_remote_code, ) - if isinstance(processor, ProcessorMixin) and \ - hasattr(processor, 'chat_template') and \ - processor.chat_template is not None: + if ( + isinstance(processor, ProcessorMixin) + and hasattr(processor, "chat_template") + and processor.chat_template is not None + ): return processor.chat_template except Exception: - logger.debug("Failed to load AutoProcessor chat template for %s", tokenizer.name_or_path, exc_info=True) # noqa: E501 + logger.debug( + "Failed to load AutoProcessor chat template for %s", + tokenizer.name_or_path, + exc_info=True, + ) # noqa: E501 # 3rd priority: AutoTokenizer chat template try: return tokenizer.get_chat_template(chat_template, tools=tools) except Exception: - logger.debug("Failed to load AutoTokenizer chat template for %s", - tokenizer.name_or_path, exc_info=True) + logger.debug( + "Failed to load AutoTokenizer chat template for %s", + tokenizer.name_or_path, + exc_info=True, + ) # 4th priority: Predefined fallbacks path = get_chat_template_fallback_path( @@ -425,12 +452,16 @@ def resolve_hf_chat_template( tokenizer_name_or_path=model_config.tokenizer, ) if path is not None: - logger.info("Loading chat template fallback for %s as there isn't one " - "defined on HF Hub.", tokenizer.name_or_path) + logger.info( + "Loading chat template fallback for %s as there isn't one " + "defined on HF Hub.", + tokenizer.name_or_path, + ) chat_template = load_chat_template(path) else: - logger.debug("There is no chat template fallback for %s", - tokenizer.name_or_path) + logger.debug( + "There is no chat template fallback for %s", tokenizer.name_or_path + ) return chat_template @@ -452,11 +483,17 @@ 
def _resolve_chat_template_content_format( else: hf_chat_template = None - jinja_text = (hf_chat_template if isinstance(hf_chat_template, str) - else load_chat_template(chat_template, is_literal=True)) + jinja_text = ( + hf_chat_template + if isinstance(hf_chat_template, str) + else load_chat_template(chat_template, is_literal=True) + ) - detected_format = ("string" if jinja_text is None else - _detect_content_format(jinja_text, default="string")) + detected_format = ( + "string" + if jinja_text is None + else _detect_content_format(jinja_text, default="string") + ) return detected_format @@ -512,7 +549,6 @@ def resolve_chat_template_content_format( return detected_format - ModalityStr = Literal["image", "audio", "video", "image_embeds"] _T = TypeVar("_T") @@ -539,6 +575,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): @cached_property def model_cls(self) -> type[SupportsMultiModal]: from vllm.model_executor.model_loader import get_model_cls + model_cls = get_model_cls(self.model_config) return cast(type[SupportsMultiModal], model_cls) @@ -574,28 +611,29 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): class MultiModalItemTracker(BaseMultiModalItemTracker[object]): - def all_mm_data(self) -> Optional[MultiModalDataDict]: if not self._items_by_modality: return None mm_inputs = {} items_by_modality = dict(self._items_by_modality) if "image" in items_by_modality and "image_embeds" in items_by_modality: - raise ValueError(\ - "Mixing raw image and embedding inputs is not allowed") + raise ValueError( + "Mixing raw image and embedding inputs is not allowed" + ) if "image_embeds" in items_by_modality: image_embeds_lst = items_by_modality["image_embeds"] if len(image_embeds_lst) > 1: - raise ValueError(\ - "Only one message can have {'type': 'image_embeds'}") + raise ValueError( + "Only one message can have {'type': 'image_embeds'}" + ) mm_inputs["image"] = image_embeds_lst[0] if "image" in items_by_modality: - mm_inputs["image"] = items_by_modality["image"] # 
A list of images + mm_inputs["image"] = items_by_modality["image"] # A list of images if "audio" in items_by_modality: - mm_inputs["audio"] = items_by_modality["audio"] # A list of audios + mm_inputs["audio"] = items_by_modality["audio"] # A list of audios if "video" in items_by_modality: - mm_inputs["video"] = items_by_modality["video"] # A list of videos + mm_inputs["video"] = items_by_modality["video"] # A list of videos return mm_inputs def create_parser(self) -> "BaseMultiModalContentParser": @@ -603,32 +641,33 @@ class MultiModalItemTracker(BaseMultiModalItemTracker[object]): class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]): - async def all_mm_data(self) -> Optional[MultiModalDataDict]: if not self._items_by_modality: return None mm_inputs = {} items_by_modality = { - modality: await asyncio.gather(*items) - for modality, items in self._items_by_modality.items() - } + modality: await asyncio.gather(*items) + for modality, items in self._items_by_modality.items() + } if "image" in items_by_modality and "image_embeds" in items_by_modality: raise ValueError( - "Mixing raw image and embedding inputs is not allowed") + "Mixing raw image and embedding inputs is not allowed" + ) if "image_embeds" in items_by_modality: image_embeds_lst = items_by_modality["image_embeds"] if len(image_embeds_lst) > 1: raise ValueError( - "Only one message can have {'type': 'image_embeds'}") + "Only one message can have {'type': 'image_embeds'}" + ) mm_inputs["image"] = image_embeds_lst[0] if "image" in items_by_modality: - mm_inputs["image"] = items_by_modality["image"] # A list of images + mm_inputs["image"] = items_by_modality["image"] # A list of images if "audio" in items_by_modality: - mm_inputs["audio"] = items_by_modality["audio"] # A list of audios + mm_inputs["audio"] = items_by_modality["audio"] # A list of audios if "video" in items_by_modality: - mm_inputs["video"] = items_by_modality["video"] # A list of videos + mm_inputs["video"] = 
items_by_modality["video"] # A list of videos return mm_inputs def create_parser(self) -> "BaseMultiModalContentParser": @@ -636,7 +675,6 @@ class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]): class BaseMultiModalContentParser(ABC): - def __init__(self) -> None: super().__init__() @@ -648,8 +686,9 @@ class BaseMultiModalContentParser(ABC): # } self._placeholder_storage: dict[str, list] = defaultdict(list) - def _add_placeholder(self, modality: ModalityStr, - placeholder: Optional[str]): + def _add_placeholder( + self, modality: ModalityStr, placeholder: Optional[str] + ): mod_placeholder = MODALITY_PLACEHOLDERS_MAP[modality] if placeholder: self._placeholder_storage[mod_placeholder].append(placeholder) @@ -662,8 +701,9 @@ class BaseMultiModalContentParser(ABC): raise NotImplementedError @abstractmethod - def parse_image_embeds(self, - image_embeds: Union[str, dict[str, str]]) -> None: + def parse_image_embeds( + self, image_embeds: Union[str, dict[str, str]] + ) -> None: raise NotImplementedError @abstractmethod @@ -684,7 +724,6 @@ class BaseMultiModalContentParser(ABC): class MultiModalContentParser(BaseMultiModalContentParser): - def __init__(self, tracker: MultiModalItemTracker) -> None: super().__init__() @@ -701,8 +740,9 @@ class MultiModalContentParser(BaseMultiModalContentParser): placeholder = self._tracker.add("image", image) self._add_placeholder("image", placeholder) - def parse_image_embeds(self, - image_embeds: Union[str, dict[str, str]]) -> None: + def parse_image_embeds( + self, image_embeds: Union[str, dict[str, str]] + ) -> None: if isinstance(image_embeds, dict): embeds = { k: self._connector.fetch_image_embedding(v) @@ -741,14 +781,13 @@ class MultiModalContentParser(BaseMultiModalContentParser): class AsyncMultiModalContentParser(BaseMultiModalContentParser): - def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: super().__init__() self._tracker = tracker self._connector = MediaConnector( 
media_io_kwargs=self._tracker._model_config.media_io_kwargs, - allowed_local_media_path=tracker.allowed_local_media_path + allowed_local_media_path=tracker.allowed_local_media_path, ) def parse_image(self, image_url: str) -> None: @@ -757,8 +796,9 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser): placeholder = self._tracker.add("image", image_coro) self._add_placeholder("image", placeholder) - def parse_image_embeds(self, - image_embeds: Union[str, dict[str, str]]) -> None: + def parse_image_embeds( + self, image_embeds: Union[str, dict[str, str]] + ) -> None: future: asyncio.Future[Union[str, dict[str, str]]] = asyncio.Future() if isinstance(image_embeds, dict): @@ -769,8 +809,7 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser): future.set_result(embeds) if isinstance(image_embeds, str): - embedding = self._connector.\ - fetch_image_embedding(image_embeds) + embedding = self._connector.fetch_image_embedding(image_embeds) future.set_result(embedding) placeholder = self._tracker.add("image_embeds", future) @@ -809,20 +848,23 @@ def validate_chat_template(chat_template: Optional[Union[Path, str]]): return elif isinstance(chat_template, Path) and not chat_template.exists(): - raise FileNotFoundError( - "the supplied chat template path doesn't exist") + raise FileNotFoundError("the supplied chat template path doesn't exist") elif isinstance(chat_template, str): JINJA_CHARS = "{}\n" - if not any(c in chat_template - for c in JINJA_CHARS) and not Path(chat_template).exists(): + if ( + not any(c in chat_template for c in JINJA_CHARS) + and not Path(chat_template).exists() + ): raise ValueError( f"The supplied chat template string ({chat_template}) " - f"appears path-like, but doesn't exist!") + f"appears path-like, but doesn't exist!" 
+ ) else: raise TypeError( - f"{type(chat_template)} is not a valid chat template type") + f"{type(chat_template)} is not a valid chat template type" + ) def _load_chat_template( @@ -835,8 +877,9 @@ def _load_chat_template( if is_literal: if isinstance(chat_template, Path): - raise TypeError("chat_template is expected to be read directly " - "from its value") + raise TypeError( + "chat_template is expected to be read directly from its value" + ) return chat_template @@ -849,9 +892,11 @@ def _load_chat_template( JINJA_CHARS = "{}\n" if not any(c in chat_template for c in JINJA_CHARS): - msg = (f"The supplied chat template ({chat_template}) " - f"looks like a file path, but it failed to be " - f"opened. Reason: {e}") + msg = ( + f"The supplied chat template ({chat_template}) " + f"looks like a file path, but it failed to be " + f"opened. Reason: {e}" + ) raise ValueError(msg) from e # If opening a file fails, set chat template to be args to @@ -870,8 +915,9 @@ def load_chat_template( return _cached_load_chat_template(chat_template, is_literal=is_literal) -def _get_interleaved_text_prompt(placeholder_storage: dict[str, list], - texts: list[str]) -> str: +def _get_interleaved_text_prompt( + placeholder_storage: dict[str, list], texts: list[str] +) -> str: for idx, elem in enumerate(texts): if elem in placeholder_storage: texts[idx] = placeholder_storage[elem].pop(0) @@ -881,10 +927,11 @@ def _get_interleaved_text_prompt(placeholder_storage: dict[str, list], # TODO: Let user specify how to insert multimodal tokens into prompt # (similar to chat template) -def _get_full_multimodal_text_prompt(placeholder_storage: dict[str, list], - texts: list[str], - interleave_strings: bool - ) -> str: +def _get_full_multimodal_text_prompt( + placeholder_storage: dict[str, list], + texts: list[str], + interleave_strings: bool, +) -> str: """Combine multimodal prompts for a multimodal language model.""" # flatten storage to make it looks like @@ -907,7 +954,6 @@ def 
_get_full_multimodal_text_prompt(placeholder_storage: dict[str, list], # Look through the text prompt to check for missing placeholders missing_placeholders: list[str] = [] for placeholder in placeholder_counts: - # For any existing placeholder in the text prompt, we leave it as is placeholder_counts[placeholder] -= text_prompt.count(placeholder) @@ -916,15 +962,18 @@ def _get_full_multimodal_text_prompt(placeholder_storage: dict[str, list], "Placeholder count is negative! " "Ensure that the 'interleave_strings' flag is disabled " "(current value: %s) " - "when manually placing image placeholders.", interleave_strings + "when manually placing image placeholders.", + interleave_strings, ) logger.debug("Input prompt: %s", text_prompt) raise ValueError( f"Found more '{placeholder}' placeholders in input prompt than " - "actual multimodal data items.") + "actual multimodal data items." + ) - missing_placeholders.extend([placeholder] * - placeholder_counts[placeholder]) + missing_placeholders.extend( + [placeholder] * placeholder_counts[placeholder] + ) # NOTE: Default behaviour: we always add missing placeholders # at the front of the prompt, if interleave_strings=False @@ -944,7 +993,8 @@ _AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam).validate_python _VideoParser = TypeAdapter(ChatCompletionContentPartVideoParam).validate_python _ResponsesInputImageParser = TypeAdapter( - ResponseInputImageParam).validate_python + ResponseInputImageParam +).validate_python _ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio, PILImage] # Define a mapping from part types to their corresponding parsing functions. 
@@ -952,32 +1002,35 @@ MM_PARSER_MAP: dict[ str, Callable[[ChatCompletionContentPartParam], _ContentPart], ] = { - "text": - lambda part: _TextParser(part).get("text", None), - "thinking": - lambda part: _ThinkParser(part).get("thinking", None), - "input_text": - lambda part: _TextParser(part).get("text", None), - "input_image": - lambda part: _ResponsesInputImageParser(part).get("image_url", None), - "image_url": - lambda part: _ImageParser(part).get("image_url", {}).get("url", None), - "image_embeds": - lambda part: _ImageEmbedsParser(part).get("image_embeds", None), + "text": lambda part: _TextParser(part).get("text", None), + "thinking": lambda part: _ThinkParser(part).get("thinking", None), + "input_text": lambda part: _TextParser(part).get("text", None), + "input_image": lambda part: _ResponsesInputImageParser(part).get( + "image_url", None + ), + "image_url": lambda part: _ImageParser(part) + .get("image_url", {}) + .get("url", None), + "image_embeds": lambda part: _ImageEmbedsParser(part).get( + "image_embeds", None + ), "image_pil": lambda part: _PILImageParser(part).get("image_pil", None), - "audio_url": - lambda part: _AudioParser(part).get("audio_url", {}).get("url", None), - "input_audio": - lambda part: _InputAudioParser(part).get("input_audio", None), - "refusal": - lambda part: _RefusalParser(part).get("refusal", None), - "video_url": - lambda part: _VideoParser(part).get("video_url", {}).get("url", None), + "audio_url": lambda part: _AudioParser(part) + .get("audio_url", {}) + .get("url", None), + "input_audio": lambda part: _InputAudioParser(part).get( + "input_audio", None + ), + "refusal": lambda part: _RefusalParser(part).get("refusal", None), + "video_url": lambda part: _VideoParser(part) + .get("video_url", {}) + .get("url", None), } def _parse_chat_message_content_mm_part( - part: ChatCompletionContentPartParam) -> tuple[str, _ContentPart]: + part: ChatCompletionContentPartParam, +) -> tuple[str, _ContentPart]: """ Parses a given multi-modal 
content part based on its type. @@ -993,7 +1046,8 @@ def _parse_chat_message_content_mm_part( ValueError: If the 'type' field is missing and no direct URL is found. """ assert isinstance( - part, dict) # This is needed to avoid mypy errors: part.get() from str + part, dict + ) # This is needed to avoid mypy errors: part.get() from str part_type = part.get("type", None) if isinstance(part_type, str) and part_type in MM_PARSER_MAP: @@ -1002,8 +1056,10 @@ def _parse_chat_message_content_mm_part( # Special case for 'image_url.detail' # We only support 'auto', which is the default if part_type == "image_url" and part.get("detail", "auto") != "auto": - logger.warning("'image_url.detail' is currently not supported " - "and will be ignored.") + logger.warning( + "'image_url.detail' is currently not supported " + "and will be ignored." + ) return part_type, content @@ -1011,19 +1067,22 @@ def _parse_chat_message_content_mm_part( # 'type' is required field by pydantic if part_type is None: if part.get("image_url") is not None: - image_params = cast(CustomChatCompletionContentSimpleImageParam, - part) + image_params = cast( + CustomChatCompletionContentSimpleImageParam, part + ) return "image_url", image_params.get("image_url", "") if part.get("audio_url") is not None: - audio_params = cast(CustomChatCompletionContentSimpleAudioParam, - part) + audio_params = cast( + CustomChatCompletionContentSimpleAudioParam, part + ) return "audio_url", audio_params.get("audio_url", "") if part.get("input_audio") is not None: input_audio_params = cast(dict[str, str], part) return "input_audio", input_audio_params if part.get("video_url") is not None: - video_params = cast(CustomChatCompletionContentSimpleVideoParam, - part) + video_params = cast( + CustomChatCompletionContentSimpleVideoParam, part + ) return "video_url", video_params.get("video_url", "") # Raise an error if no 'type' or direct URL is found. 
raise ValueError("Missing 'type' field in multimodal part.") @@ -1033,9 +1092,16 @@ def _parse_chat_message_content_mm_part( return part_type, "unknown part_type content" -VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url", - "image_embeds", "image_pil", - "audio_url", "input_audio", "video_url") +VALID_MESSAGE_CONTENT_MM_PART_TYPES = ( + "text", + "refusal", + "image_url", + "image_embeds", + "image_pil", + "audio_url", + "input_audio", + "video_url", +) def _parse_chat_message_content_parts( @@ -1055,21 +1121,20 @@ def _parse_chat_message_content_parts( part, mm_parser, wrap_dicts=wrap_dicts, - interleave_strings=interleave_strings + interleave_strings=interleave_strings, ) if parse_res: content.append(parse_res) if wrap_dicts: # Parsing wraps images and texts as interleaved dictionaries - return [ConversationMessage(role=role, - content=content)] # type: ignore + return [ConversationMessage(role=role, content=content)] # type: ignore texts = cast(list[str], content) mm_placeholder_storage = mm_parser.mm_placeholder_storage() if mm_placeholder_storage: - text_prompt = _get_full_multimodal_text_prompt(mm_placeholder_storage, - texts, - interleave_strings) + text_prompt = _get_full_multimodal_text_prompt( + mm_placeholder_storage, texts, interleave_strings + ) else: text_prompt = "\n".join(texts) @@ -1099,13 +1164,16 @@ def _parse_chat_message_content_part( if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and content is None: logger.warning( "Skipping multimodal part '%s' (type: '%s') " - "with empty / unparsable content.", part, part_type) + "with empty / unparsable content.", + part, + part_type, + ) return None if part_type in ("text", "input_text", "refusal", "thinking"): str_content = cast(str, content) if wrap_dicts: - return {'type': 'text', 'text': str_content} + return {"type": "text", "text": str_content} else: return str_content @@ -1137,8 +1205,12 @@ def _parse_chat_message_content_part( else: raise NotImplementedError(f"Unknown 
part type: {part_type}") - return {'type': modality} if wrap_dicts else ( - MODALITY_PLACEHOLDERS_MAP[modality] if interleave_strings else None + return ( + {"type": modality} + if wrap_dicts + else ( + MODALITY_PLACEHOLDERS_MAP[modality] if interleave_strings else None + ) ) @@ -1171,14 +1243,16 @@ def _parse_chat_message_content( ) for result_msg in result: - if role == 'assistant': + if role == "assistant": parsed_msg = _AssistantParser(message) # The 'tool_calls' is not None check ensures compatibility. # It's needed only if downstream code doesn't strictly # follow the OpenAI spec. - if ("tool_calls" in parsed_msg - and parsed_msg["tool_calls"] is not None): + if ( + "tool_calls" in parsed_msg + and parsed_msg["tool_calls"] is not None + ): result_msg["tool_calls"] = list(parsed_msg["tool_calls"]) elif role == "tool": parsed_msg = _ToolParser(message) @@ -1198,12 +1272,15 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None: # so, for messages that have tool_calls, parse the string (which we get # from openAI format) to dict for message in messages: - if (message["role"] == "assistant" and "tool_calls" in message - and isinstance(message["tool_calls"], list)): - + if ( + message["role"] == "assistant" + and "tool_calls" in message + and isinstance(message["tool_calls"], list) + ): for item in message["tool_calls"]: item["function"]["arguments"] = json.loads( - item["function"]["arguments"]) + item["function"]["arguments"] + ) def parse_chat_messages( @@ -1224,7 +1301,7 @@ def parse_chat_messages( content_format == "string" and model_config.multimodal_config is not None and model_config.multimodal_config.interleave_mm_strings - ) + ), ) conversation.extend(sub_messages) @@ -1252,7 +1329,7 @@ def parse_chat_messages_futures( content_format == "string" and model_config.multimodal_config is not None and model_config.multimodal_config.interleave_mm_strings - ) + ), ) conversation.extend(sub_messages) @@ -1283,10 +1360,10 @@ def 
apply_hf_chat_template( raise ValueError( "As of transformers v4.44, default chat template is no longer " "allowed, so you must provide a chat template if the tokenizer " - "does not define one.") + "does not define one." + ) try: - return tokenizer.apply_chat_template( conversation=conversation, # type: ignore[arg-type] tools=tools, # type: ignore[arg-type] @@ -1298,13 +1375,14 @@ def apply_hf_chat_template( # External library exceptions can sometimes occur despite the framework's # internal exception management capabilities. except Exception as e: - # Log and report any library-related exceptions for further # investigation. logger.exception( - "An error occurred in `transformers` while applying chat template") + "An error occurred in `transformers` while applying chat template" + ) raise ValueError(str(e)) from e + def apply_mistral_chat_template( tokenizer: MistralTokenizer, messages: list[ChatCompletionMessageParam], @@ -1337,26 +1415,26 @@ def apply_mistral_chat_template( # External library exceptions can sometimes occur despite the framework's # internal exception management capabilities. except Exception as e: - # Log and report any library-related exceptions for further # investigation. 
logger.exception( - "An error occurred in `mistral_common` while applying chat " - "template") + "An error occurred in `mistral_common` while applying chat template" + ) raise ValueError(str(e)) from e + def get_history_tool_calls_cnt(conversation: list[ConversationMessage]): idx = 0 for msg in conversation: - if msg['role'] == 'assistant': - tool_calls = msg.get('tool_calls') - idx += len(list(tool_calls)) if tool_calls is not None else 0 # noqa + if msg["role"] == "assistant": + tool_calls = msg.get("tool_calls") + idx += len(list(tool_calls)) if tool_calls is not None else 0 # noqa return idx -def make_tool_call_id(id_type:str='random', func_name=None, idx=None): - if id_type=='kimi_k2': - return f'functions.{func_name}:{idx}' +def make_tool_call_id(id_type: str = "random", func_name=None, idx=None): + if id_type == "kimi_k2": + return f"functions.{func_name}:{idx}" else: # by default return random return f"chatcmpl-tool-{random_uuid()}" diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 796b8ab5fc2cb..f506f7de16828 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -82,16 +82,26 @@ from vllm.utils import (AsyncMicrobatchTokenizer, is_list_of, logger = init_logger(__name__) -CompletionLikeRequest = Union[CompletionRequest, DetokenizeRequest, - EmbeddingCompletionRequest, RerankRequest, - ClassificationRequest, ScoreRequest, - TokenizeCompletionRequest] +CompletionLikeRequest = Union[ + CompletionRequest, + DetokenizeRequest, + EmbeddingCompletionRequest, + RerankRequest, + ClassificationRequest, + ScoreRequest, + TokenizeCompletionRequest, +] ChatLikeRequest = Union[ChatCompletionRequest, EmbeddingChatRequest, TokenizeChatRequest] SpeechToTextRequest = Union[TranscriptionRequest, TranslationRequest] -AnyRequest = Union[CompletionLikeRequest, ChatLikeRequest, SpeechToTextRequest, - ResponsesRequest, IOProcessorRequest] +AnyRequest = Union[ + 
CompletionLikeRequest, + ChatLikeRequest, + SpeechToTextRequest, + ResponsesRequest, + IOProcessorRequest, +] AnyResponse = Union[ CompletionResponse, @@ -135,6 +145,7 @@ class RequestProcessingMixin(BaseModel): Mixin for request processing, handling prompt preparation and engine input. """ + request_prompts: Optional[Sequence[RequestPrompt]] = [] engine_prompts: Optional[Union[list[EngineTokensPrompt], list[EngineEmbedsPrompt]]] = [] @@ -147,6 +158,7 @@ class ResponseGenerationMixin(BaseModel): Mixin for response generation, managing result generators and final batch results. """ + result_generator: Optional[AsyncGenerator[tuple[int, Union[ RequestOutput, PoolingRequestOutput]], None]] = None final_res_batch: list[Union[RequestOutput, PoolingRequestOutput]] = Field( @@ -155,8 +167,12 @@ class ResponseGenerationMixin(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) -class ServeContext(RequestProcessingMixin, ResponseGenerationMixin, BaseModel, - Generic[RequestT]): +class ServeContext( + RequestProcessingMixin, + ResponseGenerationMixin, + BaseModel, + Generic[RequestT], +): # Shared across all requests request: RequestT raw_request: Optional[Request] = None @@ -298,8 +314,8 @@ class OpenAIServing: truncate_prompt_tokens = getattr(ctx.request, "truncate_prompt_tokens", None) - if truncate_prompt_tokens is not None and \ - truncate_prompt_tokens > self.max_model_len: + if (truncate_prompt_tokens is not None + and truncate_prompt_tokens > self.max_model_len): return self.create_error_response( "truncate_prompt_tokens value is " "greater than max_model_len." 
@@ -344,10 +360,12 @@ class OpenAIServing: return self.create_error_response( "Request prompts not available") - self._log_inputs(request_id_item, - ctx.request_prompts[i], - params=pooling_params, - lora_request=ctx.lora_request) + self._log_inputs( + request_id_item, + ctx.request_prompts[i], + params=pooling_params, + lora_request=ctx.lora_request, + ) # Mypy has an existing bug related to inferring the variance of # TypedDicts with `builtins.enumerate`: @@ -410,10 +428,11 @@ class OpenAIServing: return self.create_error_response(str(e)) def create_error_response( - self, - message: str, - err_type: str = "BadRequestError", - status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse: + self, + message: str, + err_type: str = "BadRequestError", + status_code: HTTPStatus = HTTPStatus.BAD_REQUEST, + ) -> ErrorResponse: if self.log_error_stack: exc_type, _, _ = sys.exc_info() if exc_type is not None: @@ -424,10 +443,11 @@ class OpenAIServing: message=message, type=err_type, code=status_code.value)) def create_streaming_error_response( - self, - message: str, - err_type: str = "BadRequestError", - status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> str: + self, + message: str, + err_type: str = "BadRequestError", + status_code: HTTPStatus = HTTPStatus.BAD_REQUEST, + ) -> str: json_str = json.dumps( self.create_error_response(message=message, err_type=err_type, @@ -438,25 +458,25 @@ class OpenAIServing: self, request: AnyRequest, ) -> Optional[ErrorResponse]: - error_response = None if self._is_model_supported(request.model): return None if request.model in self.models.lora_requests: return None - if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING and request.model and ( - load_result := await self.models.resolve_lora(request.model)): + if (envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING and request.model and + (load_result := await self.models.resolve_lora(request.model))): if isinstance(load_result, LoRARequest): return None - if isinstance(load_result, ErrorResponse) and \ - 
load_result.error.code == HTTPStatus.BAD_REQUEST.value: + if (isinstance(load_result, ErrorResponse) and + load_result.error.code == HTTPStatus.BAD_REQUEST.value): error_response = load_result return error_response or self.create_error_response( message=f"The model `{request.model}` does not exist.", err_type="NotFoundError", - status_code=HTTPStatus.NOT_FOUND) + status_code=HTTPStatus.NOT_FOUND, + ) def _get_active_default_mm_loras( self, request: AnyRequest) -> Optional[LoRARequest]: @@ -487,7 +507,6 @@ class OpenAIServing: request: AnyRequest, supports_default_mm_loras: bool = False, ) -> Optional[LoRARequest]: - if request.model in self.models.lora_requests: return self.models.lora_requests[request.model] @@ -548,13 +567,15 @@ class OpenAIServing: prompt, add_special_tokens=add_special_tokens, truncation=True, - max_length=self.max_model_len) + max_length=self.max_model_len, + ) else: encoded = await async_tokenizer( prompt, add_special_tokens=add_special_tokens, truncation=True, - max_length=truncate_prompt_tokens) + max_length=truncate_prompt_tokens, + ) input_ids = encoded.input_ids input_text = prompt @@ -595,16 +616,22 @@ class OpenAIServing: # Note: EmbeddingRequest, ClassificationRequest, # and ScoreRequest doesn't have max_tokens - if isinstance(request, - (EmbeddingChatRequest, EmbeddingCompletionRequest, - ScoreRequest, RerankRequest, ClassificationRequest)): - + if isinstance( + request, + ( + EmbeddingChatRequest, + EmbeddingCompletionRequest, + ScoreRequest, + RerankRequest, + ClassificationRequest, + ), + ): # Note: input length can be up to the entire model context length # since these requests don't generate tokens. 
if token_num > self.max_model_len: operations: dict[type[AnyRequest], str] = { ScoreRequest: "score", - ClassificationRequest: "classification" + ClassificationRequest: "classification", } operation = operations.get(type(request), "embedding generation") @@ -618,8 +645,11 @@ class OpenAIServing: # Note: TokenizeRequest and DetokenizeRequest doesn't have max_tokens # and does not require model context length validation - if isinstance(request, (TokenizeCompletionRequest, TokenizeChatRequest, - DetokenizeRequest)): + if isinstance( + request, + (TokenizeCompletionRequest, TokenizeChatRequest, + DetokenizeRequest), + ): return TextTokensPrompt(prompt=input_text, prompt_token_ids=input_ids) @@ -639,8 +669,8 @@ class OpenAIServing: f"{token_num} input tokens. Please reduce the length of " "the input messages.") - if max_tokens is not None and \ - token_num + max_tokens > self.max_model_len: + if (max_tokens is not None + and token_num + max_tokens > self.max_model_len): raise ValueError( "'max_tokens' or 'max_completion_tokens' is too large: " f"{max_tokens}. 
This model's maximum context length is " @@ -745,13 +775,14 @@ class OpenAIServing: tasks = [] for prompt_input in batch_inputs: if prompt_input["is_tokens"] is False: - assert tokenizer is not None, \ - "Tokenizer is required for text prompts" + assert tokenizer is not None, ( + "Tokenizer is required for text prompts") task = self._normalize_prompt_text_to_input( request, prompt_input["content"], tokenizer=tokenizer, - add_special_tokens=add_special_tokens) + add_special_tokens=add_special_tokens, + ) else: task = self._normalize_prompt_tokens_to_input( request, prompt_input["content"], tokenizer=tokenizer) @@ -766,9 +797,14 @@ class OpenAIServing: @overload async def _preprocess_completion( self, - request: Union[DetokenizeRequest, EmbeddingCompletionRequest, - RerankRequest, ClassificationRequest, ScoreRequest, - TokenizeCompletionRequest], + request: Union[ + DetokenizeRequest, + EmbeddingCompletionRequest, + RerankRequest, + ClassificationRequest, + ScoreRequest, + TokenizeCompletionRequest, + ], tokenizer: Optional[AnyTokenizer], input_or_inputs: Union[str, list[str], list[int], list[list[int]]], add_special_tokens: bool = ..., @@ -783,8 +819,10 @@ class OpenAIServing: input_or_inputs: Optional[Union[str, list[str], list[int], list[list[int]]]], add_special_tokens: bool = ..., - ) -> tuple[list[Union[TextTokensPrompt, EmbedsPrompt]], list[Union[ - EngineTokensPrompt, EngineEmbedsPrompt]]]: + ) -> tuple[ + list[Union[TextTokensPrompt, EmbedsPrompt]], + list[Union[EngineTokensPrompt, EngineEmbedsPrompt]], + ]: ... 
async def _preprocess_completion( @@ -794,32 +832,38 @@ class OpenAIServing: input_or_inputs: Optional[Union[str, list[str], list[int], list[list[int]]]], add_special_tokens: bool = True, - ) -> tuple[Union[list[TextTokensPrompt], list[Union[ - TextTokensPrompt, EmbedsPrompt]]], Union[ - list[EngineTokensPrompt], list[Union[EngineTokensPrompt, - EngineEmbedsPrompt]]]]: - if not isinstance(request, - CompletionRequest) and input_or_inputs is None: + ) -> tuple[ + Union[list[TextTokensPrompt], list[Union[TextTokensPrompt, + EmbedsPrompt]]], + Union[ + list[EngineTokensPrompt], + list[Union[EngineTokensPrompt, EngineEmbedsPrompt]], + ], + ]: + if (not isinstance(request, CompletionRequest) + and input_or_inputs is None): raise ValueError( "Prompt embeds with non-completion requests is not" " currently supported.") - (request_prompts_text, request_prompts_embeds - ) = await self._tokenize_prompt_input_or_inputs_async( - request, - tokenizer, - input_or_inputs, - add_special_tokens=add_special_tokens, - ) + ( + request_prompts_text, + request_prompts_embeds, + ) = await self._tokenize_prompt_input_or_inputs_async( + request, + tokenizer, + input_or_inputs, + add_special_tokens=add_special_tokens, + ) engine_prompts_text = [ EngineTokensPrompt( prompt_token_ids=request_prompt_text["prompt_token_ids"]) for request_prompt_text in request_prompts_text ] - cache_salt = request.cache_salt if ( - hasattr(request, "cache_salt") - and request.cache_salt is not None) else None + cache_salt = (request.cache_salt if + (hasattr(request, "cache_salt") + and request.cache_salt is not None) else None) if cache_salt: for prompt_text in engine_prompts_text: prompt_text["cache_salt"] = cache_salt @@ -831,8 +875,8 @@ class OpenAIServing: # non-completion requests and if we don't add the overload here, # everywhere this function is used outside of serving_completion will # need logic asserting that only text prompts are in the request. 
- if not isinstance(request, - CompletionRequest) and input_or_inputs is not None: + if (not isinstance(request, CompletionRequest) + and input_or_inputs is not None): return request_prompts_text, engine_prompts_text engine_prompts_embeds = [ @@ -862,8 +906,11 @@ class OpenAIServing: chat_template_kwargs: Optional[dict[str, Any]] = None, tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None, add_special_tokens: bool = False, - ) -> tuple[list[ConversationMessage], Sequence[RequestPrompt], - list[EngineTokensPrompt]]: + ) -> tuple[ + list[ConversationMessage], + Sequence[RequestPrompt], + list[EngineTokensPrompt], + ]: model_config = self.model_config resolved_content_format = resolve_chat_template_content_format( @@ -925,8 +972,8 @@ class OpenAIServing: if tokenizer is None: assert isinstance(request_prompt, str), ( - "Prompt has to be a string", \ - "when the tokenizer is not initialised" + "Prompt has to be a string", + "when the tokenizer is not initialised", ) prompt_inputs = TextTokensPrompt(prompt=request_prompt, prompt_token_ids=[1]) @@ -943,7 +990,8 @@ class OpenAIServing: "Prompt has to be either a string or a list of token ids") prompt_inputs = TextTokensPrompt( prompt=tokenizer.decode(request_prompt), - prompt_token_ids=request_prompt) + prompt_token_ids=request_prompt, + ) engine_prompt = EngineTokensPrompt( prompt_token_ids=prompt_inputs["prompt_token_ids"]) @@ -1007,22 +1055,23 @@ class OpenAIServing: prompt_token_ids=prompt_token_ids) request_prompt = prompt_token_ids # Update the sampling params. 
- sampling_params.max_tokens = (self.max_model_len - - len(prompt_token_ids)) + sampling_params.max_tokens = self.max_model_len - len( + prompt_token_ids) # OPTIMIZATION priority = orig_priority - 1 @staticmethod def _load_prompt_embeds( prompt_embeds: Optional[Union[bytes, list[bytes]]], - truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, ) -> list[EmbedsPrompt]: def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt: - tensor = torch.load(io.BytesIO( - pybase64.b64decode(embed, validate=True)), - weights_only=True, - map_location=torch.device("cpu")) + tensor = torch.load( + io.BytesIO(pybase64.b64decode(embed, validate=True)), + weights_only=True, + map_location=torch.device("cpu"), + ) assert isinstance(tensor, torch.Tensor) and tensor.dtype in ( torch.float32, torch.bfloat16, @@ -1061,7 +1110,7 @@ class OpenAIServing: prompt = inputs elif isinstance(inputs, list): prompt_token_ids = inputs - elif 'prompt_embeds' in inputs: + elif "prompt_embeds" in inputs: prompt_embeds = inputs.get("prompt_embeds") else: prompt = inputs["prompt"] @@ -1101,10 +1150,12 @@ class OpenAIServing: return raw_request.headers.get("X-Request-Id", default) @staticmethod - def _get_decoded_token(logprob: Logprob, - token_id: int, - tokenizer: AnyTokenizer, - return_as_token_id: bool = False) -> str: + def _get_decoded_token( + logprob: Logprob, + token_id: int, + tokenizer: AnyTokenizer, + return_as_token_id: bool = False, + ) -> str: if return_as_token_id: return f"token_id:{token_id}" @@ -1117,9 +1168,11 @@ class OpenAIServing: return True return self.models.is_base_model(model_name) - def _get_model_name(self, - model_name: Optional[str] = None, - lora_request: Optional[LoRARequest] = None) -> str: + def _get_model_name( + self, + model_name: Optional[str] = None, + lora_request: Optional[LoRARequest] = None, + ) -> str: if lora_request: return lora_request.lora_name if not model_name: @@ 
-1129,7 +1182,7 @@ class OpenAIServing: def clamp_prompt_logprobs( prompt_logprobs: Union[PromptLogprobs, - None]) -> Union[PromptLogprobs, None]: + None], ) -> Union[PromptLogprobs, None]: if prompt_logprobs is None: return prompt_logprobs @@ -1137,6 +1190,6 @@ def clamp_prompt_logprobs( if logprob_dict is None: continue for logprob_values in logprob_dict.values(): - if logprob_values.logprob == float('-inf'): + if logprob_values.logprob == float("-inf"): logprob_values.logprob = -9999.0 return prompt_logprobs From 9480ae24e38cb73c5b665f5843ebd92c75a2039f Mon Sep 17 00:00:00 2001 From: Kyuyeun Kim <62023335+kyuyeunk@users.noreply.github.com> Date: Tue, 2 Sep 2025 10:56:31 -0700 Subject: [PATCH 32/95] [Bugfix] Fix packed_factor missing attribute error (#23902) Signed-off-by: Kyuyeun Kim --- vllm/model_executor/layers/linear.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 19ff63145024f..f24c87dbf4509 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -722,8 +722,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear): # If quantized, we need to adjust the offset and size to account # for the packing. if packed_dim == output_dim: - shard_size = shard_size // param.pack_factor - shard_offset = shard_offset // param.pack_factor + shard_size = shard_size // param.packed_factor + shard_offset = shard_offset // param.packed_factor # Special case for Marlin. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) @@ -756,8 +756,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear): # for the packing. 
packed_dim = getattr(param, "packed_dim", None) if packed_dim == output_dim: - shard_size = shard_size // param.pack_factor - shard_offset = shard_offset // param.pack_factor + shard_size = shard_size // param.packed_factor + shard_offset = shard_offset // param.packed_factor # Special case for Marlin. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) @@ -1107,8 +1107,8 @@ class QKVParallelLinear(ColumnParallelLinear): # If quantized, we need to adjust the offset and size to account # for the packing. if packed_dim == output_dim: - shard_size = shard_size // param.pack_factor - shard_offset = shard_offset // param.pack_factor + shard_size = shard_size // param.packed_factor + shard_offset = shard_offset // param.packed_factor # Special case for Marlin. shard_size, shard_offset = adjust_marlin_shard( @@ -1155,8 +1155,8 @@ class QKVParallelLinear(ColumnParallelLinear): # for the packing. packed_dim = getattr(param, "packed_dim", None) if packed_dim == output_dim: - shard_size = shard_size // param.pack_factor - shard_offset = shard_offset // param.pack_factor + shard_size = shard_size // param.packed_factor + shard_offset = shard_offset // param.packed_factor # Special case for Marlin. 
shard_size, shard_offset = adjust_marlin_shard( From 2417798471af8521e488c04f3a43c91e6836a705 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Tue, 2 Sep 2025 19:10:10 +0100 Subject: [PATCH 33/95] [Metrics] Deprecate TPOT in favor of ITL (#24110) Signed-off-by: Mark McLoughlin --- .../prometheus_grafana/grafana.json | 12 +++++----- tests/entrypoints/openai/test_metrics.py | 22 +++++++++++++------ vllm/engine/llm_engine.py | 8 +++---- vllm/engine/metrics.py | 18 +++++++++++++-- vllm/engine/metrics_types.py | 2 +- vllm/v1/metrics/loggers.py | 22 ++++++++++++++++--- vllm/v1/metrics/stats.py | 6 ++--- 7 files changed, 64 insertions(+), 26 deletions(-) diff --git a/examples/online_serving/prometheus_grafana/grafana.json b/examples/online_serving/prometheus_grafana/grafana.json index 3488956a5b24c..37abc9de926fd 100644 --- a/examples/online_serving/prometheus_grafana/grafana.json +++ b/examples/online_serving/prometheus_grafana/grafana.json @@ -402,7 +402,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "includeNullMetadata": false, "instant": false, @@ -418,7 +418,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -435,7 +435,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) 
(rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -452,7 +452,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -468,7 +468,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "rate(vllm:time_per_output_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", + "expr": "rate(vllm:inter_token_latency_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:inter_token_latency_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", "hide": false, "instant": false, "legendFormat": "Mean", @@ -476,7 +476,7 @@ "refId": "E" } ], - "title": "Time Per Output Token Latency", + "title": "Inter Token Latency", "type": "timeseries" }, { diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index ff2e7004ff9f8..a4e1aca8bcac2 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -250,12 +250,15 @@ EXPECTED_METRICS_V1 = [ "vllm:request_params_max_tokens_sum", "vllm:request_params_max_tokens_bucket", "vllm:request_params_max_tokens_count", - "vllm:time_to_first_token_seconds_sum", - "vllm:time_to_first_token_seconds_bucket", - "vllm:time_to_first_token_seconds_count", "vllm:time_per_output_token_seconds_sum", 
"vllm:time_per_output_token_seconds_bucket", "vllm:time_per_output_token_seconds_count", + "vllm:time_to_first_token_seconds_sum", + "vllm:time_to_first_token_seconds_bucket", + "vllm:time_to_first_token_seconds_count", + "vllm:inter_token_latency_seconds_sum", + "vllm:inter_token_latency_seconds_bucket", + "vllm:inter_token_latency_seconds_count", "vllm:e2e_request_latency_seconds_sum", "vllm:e2e_request_latency_seconds_bucket", "vllm:e2e_request_latency_seconds_count", @@ -273,7 +276,11 @@ EXPECTED_METRICS_V1 = [ "vllm:request_decode_time_seconds_count", ] -HIDDEN_DEPRECATED_METRICS: list[str] = [] +HIDDEN_DEPRECATED_METRICS: list[str] = [ + "vllm:time_per_output_token_seconds_sum", + "vllm:time_per_output_token_seconds_bucket", + "vllm:time_per_output_token_seconds_count", +] @pytest.mark.asyncio @@ -289,9 +296,10 @@ async def test_metrics_exist(server: RemoteOpenAIServer, assert response.status_code == HTTPStatus.OK for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS): - if (not server.show_hidden_metrics - and metric not in HIDDEN_DEPRECATED_METRICS): - assert metric in response.text + if (metric in HIDDEN_DEPRECATED_METRICS + and not server.show_hidden_metrics): + continue + assert metric in response.text @pytest.mark.asyncio diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 10ded6f16d41c..47f56e58130fa 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1414,7 +1414,7 @@ class LLMEngine: num_generation_tokens_iter = 0 num_tokens_iter = 0 time_to_first_tokens_iter: List[float] = [] - time_per_output_tokens_iter: List[float] = [] + inter_token_latencies_iter: List[float] = [] num_preemption_iter = (0 if scheduler_outputs is None else scheduler_outputs.preempted) @@ -1498,9 +1498,9 @@ class LLMEngine: num_generation_tokens_from_prefill_groups += ( seq_group.num_seqs()) else: - # TPOTs. 
+ # ITLs latency = seq_group.get_last_token_latency() - time_per_output_tokens_iter.append(latency) + inter_token_latencies_iter.append(latency) if seq_group.state.current_step == 0: # For async_output_proc, the do_log_stats() # is called following init_multi_step(), which @@ -1582,7 +1582,7 @@ class LLMEngine: num_generation_tokens_iter=num_generation_tokens_iter, num_tokens_iter=num_tokens_iter, time_to_first_tokens_iter=time_to_first_tokens_iter, - time_per_output_tokens_iter=time_per_output_tokens_iter, + inter_token_latencies_iter=inter_token_latencies_iter, num_preemption_iter=num_preemption_iter, # Request stats diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index ba8dbd1fad791..0a8709db40880 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -113,9 +113,21 @@ class Metrics: 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, 2560.0 ]) + # Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds + # TODO: in 0.12, only enable if show_hidden_metrics=True self.histogram_time_per_output_token = self._histogram_cls( name="vllm:time_per_output_token_seconds", - documentation="Histogram of time per output token in seconds.", + documentation=( + "Histogram of time per output token in seconds." 
+ "DEPRECATED: Use vllm:inter_token_latency_seconds instead."), + labelnames=labelnames, + buckets=[ + 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, + 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0 + ]) + self.histogram_inter_token_latency = self._histogram_cls( + name="vllm:inter_token_latency_seconds", + documentation="Histogram of inter token latency in seconds.", labelnames=labelnames, buckets=[ 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, @@ -491,7 +503,9 @@ class PrometheusStatLogger(StatLoggerBase): self._log_histogram(self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter) self._log_histogram(self.metrics.histogram_time_per_output_token, - stats.time_per_output_tokens_iter) + stats.inter_token_latencies_iter) + self._log_histogram(self.metrics.histogram_inter_token_latency, + stats.inter_token_latencies_iter) # Request level data # Latency diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index 3281a9121a9df..9778ab5a8c99b 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -43,7 +43,7 @@ class Stats: num_generation_tokens_iter: int num_tokens_iter: int time_to_first_tokens_iter: List[float] - time_per_output_tokens_iter: List[float] + inter_token_latencies_iter: List[float] num_preemption_iter: int # Request stats (should have _requests suffix) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 3b0616952babf..41e07a00564aa 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -377,9 +377,13 @@ class PrometheusStatLogger(StatLoggerBase): self.histogram_time_to_first_token = make_per_engine( histogram_time_to_first_token, engine_indexes, model_name) + # Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds + # TODO: in 0.12, only enable if show_hidden_metrics=True histogram_time_per_output_token = self._histogram_cls( name="vllm:time_per_output_token_seconds", - documentation="Histogram of time per 
output token in seconds.", + documentation=( + "Histogram of time per output token in seconds." + "DEPRECATED: Use vllm:inter_token_latency_seconds instead."), buckets=[ 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0 @@ -388,6 +392,17 @@ class PrometheusStatLogger(StatLoggerBase): self.histogram_time_per_output_token = make_per_engine( histogram_time_per_output_token, engine_indexes, model_name) + histogram_inter_token_latency = self._histogram_cls( + name="vllm:inter_token_latency_seconds", + documentation="Histogram of inter-token latency in seconds.", + buckets=[ + 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, + 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0 + ], + labelnames=labelnames) + self.histogram_inter_token_latency = make_per_engine( + histogram_inter_token_latency, engine_indexes, model_name) + request_latency_buckets = [ 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0 @@ -537,8 +552,9 @@ class PrometheusStatLogger(StatLoggerBase): self.histogram_n_request[engine_idx].observe(n_param) for ttft in iteration_stats.time_to_first_tokens_iter: self.histogram_time_to_first_token[engine_idx].observe(ttft) - for tpot in iteration_stats.time_per_output_tokens_iter: - self.histogram_time_per_output_token[engine_idx].observe(tpot) + for itl in iteration_stats.inter_token_latencies_iter: + self.histogram_inter_token_latency[engine_idx].observe(itl) + self.histogram_time_per_output_token[engine_idx].observe(itl) for finished_request in iteration_stats.finished_requests: self.counter_request_success[ diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 95094bda65cde..45c32aaaaf6c4 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -96,7 +96,7 @@ class IterationStats: self.max_num_generation_tokens_iter: list[int] = [] self.n_params_iter: list[int] = [] 
self.time_to_first_tokens_iter: list[float] = [] - self.time_per_output_tokens_iter: list[float] = [] + self.inter_token_latencies_iter: list[float] = [] self.waiting_lora_adapters: dict[str, int] = {} self.running_lora_adapters: dict[str, int] = {} @@ -128,8 +128,8 @@ class IterationStats: if is_prefilling: req_stats.first_token_ts = engine_core_timestamp else: - tpot = engine_core_timestamp - req_stats.last_token_ts - self.time_per_output_tokens_iter.append(tpot) + itl = engine_core_timestamp - req_stats.last_token_ts + self.inter_token_latencies_iter.append(itl) req_stats.last_token_ts = engine_core_timestamp From 598bd74cf80f91d0de422d6ac994034d86b9e31c Mon Sep 17 00:00:00 2001 From: nathan <97126670+nathanrchn@users.noreply.github.com> Date: Tue, 2 Sep 2025 20:34:28 +0200 Subject: [PATCH 34/95] Fix weights loading for Apertus (#24100) Signed-off-by: Nathan Ranchin --- vllm/model_executor/models/apertus.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py index 0de683d2cd060..f6400b05e110a 100644 --- a/vllm/model_executor/models/apertus.py +++ b/vllm/model_executor/models/apertus.py @@ -415,6 +415,12 @@ class ApertusModel(nn.Module): (".qkv_proj", ".v_proj", "v"), ] params_dict = dict(self.named_parameters()) + + # we need to load the buffers for beta and eps (XIELU) + for name, buffer in self.named_buffers(): + if name.endswith(".beta") or name.endswith(".eps"): + params_dict[name] = buffer + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: From 98aee612aa13155badc2747bd51b378d6e515958 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 2 Sep 2025 14:53:34 -0400 Subject: [PATCH 35/95] [Log] Only Print Profiler Results on Rank 0 (#23370) Signed-off-by: yewentao256 --- vllm/v1/worker/gpu_worker.py | 6 ++++-- vllm/worker/worker.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 
deletions(-) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index f49f5bdd9703b..cb000d53a923d 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -400,8 +400,10 @@ class Worker(WorkerBase): self.profiler.start() else: self.profiler.stop() - print(self.profiler.key_averages().table( - sort_by="self_cuda_time_total")) + # only print profiler results on rank 0 + if self.local_rank == 0: + print(self.profiler.key_averages().table( + sort_by="self_cuda_time_total")) def execute_dummy_batch(self) -> None: self.model_runner._dummy_run(1) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 2e20c89c632c5..2d2e51c329e74 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -128,8 +128,10 @@ class Worker(LocalOrDistributedWorkerBase): if self.profiler is None: raise RuntimeError("Profiler is not enabled.") self.profiler.stop() - print( - self.profiler.key_averages().table(sort_by="self_cuda_time_total")) + # only print profiler results on rank 0 + if self.local_rank == 0: + print(self.profiler.key_averages().table( + sort_by="self_cuda_time_total")) def sleep(self, level: int = 1) -> None: free_bytes_before_sleep = torch.cuda.mem_get_info()[0] From d328f7894f140fdc643dc1aa5fe80f4596e6f418 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Tue, 2 Sep 2025 22:15:06 +0200 Subject: [PATCH 36/95] [CI] Enable all hf transformers baselines in test_hybrid (#23936) Signed-off-by: Thomas Parnell --- .../models/language/generation/test_hybrid.py | 76 ++++++------------- tests/models/registry.py | 13 +++- 2 files changed, 32 insertions(+), 57 deletions(-) diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 3cacbdcfbe86e..9e97e3fa65775 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -34,17 +34,6 @@ HYBRID_MODELS = [ "LiquidAI/LFM2-1.2B", ] -HF_UNSUPPORTED_MODELS = [ - # The HF 
transformers implementation of - # Mamba2 is buggy for Codestral as it doesn't handle n_groups, so the test - # doesn't compare vLLM output with HF output. - # See https://github.com/huggingface/transformers/pull/35943 - "yujiepan/mamba2-codestral-v0.1-tiny-random", - # transformers 4.55 is still producing garbage for this model - # TODO(tdoublep): follow-up on transformers side - "ibm-granite/granite-4.0-tiny-preview" -] - V1_SUPPORTED_MODELS = [ "state-spaces/mamba-130m-hf", "ai21labs/Jamba-tiny-dev", @@ -90,20 +79,13 @@ def test_models( try: model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - hf_version_check = model_info.check_transformers_version( - on_fail="return") + model_info.check_transformers_version(on_fail="skip") except ValueError: - hf_version_check = None - - if hf_version_check is not None: - print(f"Skipping transformers comparison because: {hf_version_check}") + pass with hf_runner(model) as hf_model: - if model not in HF_UNSUPPORTED_MODELS and hf_version_check is None: - hf_outputs = hf_model.generate_greedy_logprobs_limit( - example_prompts, max_tokens, num_logprobs) - else: - hf_outputs = None + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, num_logprobs) with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "0") @@ -121,7 +103,7 @@ def test_models( else: vllm_v1_outputs = None - if hf_outputs is not None and vllm_v0_outputs is not None: + if vllm_v0_outputs is not None: check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_v0_outputs, @@ -130,12 +112,10 @@ def test_models( ) if model in V1_SUPPORTED_MODELS: - ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs - assert ref_outputs is not None check_logprobs_close( - outputs_0_lst=ref_outputs, + outputs_0_lst=hf_outputs, outputs_1_lst=vllm_v1_outputs, - name_0="hf" if hf_outputs is not None else "vllm-v0", + name_0="hf", name_1="vllm-v1", ) @@ -402,11 +382,8 @@ def 
test_full_cuda_graph( pass with hf_runner(model) as hf_model: - if model not in HF_UNSUPPORTED_MODELS: - hf_outputs = hf_model.generate_greedy_logprobs_limit( - example_prompts, max_tokens, num_logprobs) - else: - hf_outputs = None + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, num_logprobs) with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "0") @@ -421,7 +398,7 @@ def test_full_cuda_graph( vllm_v1_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - if hf_outputs is not None and vllm_v0_outputs is not None: + if vllm_v0_outputs is not None: check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_v0_outputs, @@ -429,12 +406,10 @@ def test_full_cuda_graph( name_1="vllm-v0", ) - ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs - assert ref_outputs is not None check_logprobs_close( - outputs_0_lst=ref_outputs, + outputs_0_lst=hf_outputs, outputs_1_lst=vllm_v1_outputs, - name_0="hf" if hf_outputs is not None else "vllm-v0", + name_0="hf", name_1="vllm-v1", ) @@ -460,11 +435,8 @@ def test_fp32_state( pass with hf_runner(model) as hf_model: - if model not in HF_UNSUPPORTED_MODELS: - hf_outputs = hf_model.generate_greedy_logprobs_limit( - example_prompts, max_tokens, num_logprobs) - else: - hf_outputs = None + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, num_logprobs) with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "0") @@ -480,18 +452,16 @@ def test_fp32_state( vllm_v1_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - if hf_outputs is not None: - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_v0_outputs, - name_0="hf", - name_1="vllm-v0", - ) - - ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs check_logprobs_close( - outputs_0_lst=ref_outputs, + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_v0_outputs, + 
name_0="hf", + name_1="vllm-v0", + ) + + check_logprobs_close( + outputs_0_lst=hf_outputs, outputs_1_lst=vllm_v1_outputs, - name_0="hf" if hf_outputs is not None else "vllm-v0", + name_0="hf", name_1="vllm-v1", ) diff --git a/tests/models/registry.py b/tests/models/registry.py index 3b5cec2dc7022..4cf3dd6e08ced 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -154,7 +154,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "BailingMoeForCausalLM": _HfExamplesInfo("inclusionAI/Ling-lite-1.5", trust_remote_code=True), "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B-v1", - min_transformers_version="4.56.0", + min_transformers_version="4.55.3", extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"}), # noqa: E501 "BloomForCausalLM": _HfExamplesInfo("bigscience/bloom-560m", {"1b": "bigscience/bloomz-1b1"}), @@ -208,7 +208,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "GptOssForCausalLM": _HfExamplesInfo("lmsys/gpt-oss-20b-bf16"), "GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"), "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"), - "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview"), # noqa: E501 + "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview", # noqa: E501 + min_transformers_version="4.55.3"), "GraniteMoeSharedForCausalLM": _HfExamplesInfo("ibm-research/moe-7b-1b-active-shared-experts"), # noqa: E501 "Grok1ModelForCausalLM": _HfExamplesInfo("hpcai-tech/grok-1", trust_remote_code=True), @@ -228,7 +229,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True), "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"), "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini", - min_transformers_version="4.56.0", + min_transformers_version="4.55.3", extras={ "tiny": "ai21labs/Jamba-tiny-dev", "random": "ai21labs/Jamba-tiny-random", # noqa: E501 @@ -244,7 +245,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "Llama4ForCausalLM": 
_HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501 is_available_online=False), "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"), - "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1"), + "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1", + min_transformers_version="4.55.3", + extras={ + "random": "yujiepan/mamba2-codestral-v0.1-tiny-random", # noqa: E501 + }), "FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"), # noqa: E501 "MiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-2B-sft-bf16", trust_remote_code=True), From 457e4719710e44014fb8b2eaf668d0c5cfa145c1 Mon Sep 17 00:00:00 2001 From: rasmith Date: Tue, 2 Sep 2025 17:13:57 -0500 Subject: [PATCH 37/95] [AMD][Kernel][Bugfix] Cast offsets tensor bn to tl.int64 to avoid GPU segfault (#23692) Signed-off-by: Randall Smith --- vllm/attention/ops/prefix_prefill.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index e1d41930f6231..a70db89cdb76e 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -146,7 +146,7 @@ def _fwd_kernel(Q, start_n = tl.multiple_of(start_n, BLOCK_SIZE) # -- compute qk ---- bn = tl.load(B_Loc + cur_batch * stride_b_loc_b + - (start_n // BLOCK_SIZE) * stride_b_loc_s) + (start_n // BLOCK_SIZE) * stride_b_loc_s).to(tl.int64) # [D,BLOCK_SIZE] off_k = ( bn[None, :] * stride_k_cache_bs + cur_kv_head * stride_k_cache_h + @@ -367,7 +367,7 @@ def _fwd_kernel_flash_attn_v2( bn = tl.load(B_Loc + cur_batch * stride_b_loc_b + ((start_n + offs_n) // block_size) * stride_b_loc_s, mask=(start_n + offs_n) < cur_batch_ctx_len, - other=0) + other=0).to(tl.int64) off_k = ( bn[None, :] * stride_k_cache_bs + cur_kv_head * stride_k_cache_h + (offs_d[:, None] // x) * stride_k_cache_d + @@ -575,7 +575,7 @@ def _fwd_kernel_alibi( bn = tl.load(B_Loc + cur_batch * 
stride_b_loc_b + ((start_n + offs_n) // block_size) * stride_b_loc_s, mask=(start_n + offs_n) < cur_batch_ctx_len, - other=0) + other=0).to(tl.int64) off_k = ( bn[None, :] * stride_k_cache_bs + cur_kv_head * stride_k_cache_h + (offs_d[:, None] // x) * stride_k_cache_d + From 930a24144c073a08cfecabd75a242e713bc4f57e Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 2 Sep 2025 18:22:30 -0400 Subject: [PATCH 38/95] [Bug] R1 Accuracy: Fix `routed_scaling_factor` Double Mul Issue (#24119) Signed-off-by: yewentao256 --- vllm/model_executor/models/deepseek_v2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 36c9427e474e9..3a8eaf681733d 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -160,7 +160,8 @@ class DeepseekV2MoE(nn.Module): topk_group=config.topk_group, prefix=f"{prefix}.experts", scoring_func=config.scoring_func, - routed_scaling_factor=self.routed_scaling_factor, + # we do scaling outside, set factor to 1.0 to avoid double mul + routed_scaling_factor=1.0, e_score_correction_bias=self.gate.e_score_correction_bias, enable_eplb=self.enable_eplb, num_redundant_experts=self.n_redundant_experts) From 2fd1a40a54cf9a5af6f0a8ce4700faf4a1a5108b Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 2 Sep 2025 19:50:28 -0400 Subject: [PATCH 39/95] [CI/Build] Disable SiluMul NVFP4 quant fusion tests (#24121) Signed-off-by: Matthew Bonanni --- .buildkite/test-pipeline.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 55349e0ac9321..be7044c41a732 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -666,7 +666,7 @@ steps: # Quantization - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' - pytest -v -s 
tests/kernels/quantization/test_nvfp4_quant.py - - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py + # - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py @@ -676,7 +676,7 @@ steps: - pytest -v -s tests/compile/test_fusion_all_reduce.py - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern - pytest -v -s tests/kernels/moe/test_flashinfer.py - - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py + # - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py ##### 1 GPU test ##### ##### multi gpus test ##### From 862f2ef893d9751db0a92bd2d4ae0e3d9677872f Mon Sep 17 00:00:00 2001 From: Chaojun Zhang Date: Wed, 3 Sep 2025 08:21:18 +0800 Subject: [PATCH 40/95] [XPU] Fix the bug of LoRA logits on the XPU platform (#24081) Signed-off-by: chzhang --- vllm/lora/layers.py | 2 +- vllm/lora/punica_wrapper/punica_xpu.py | 13 ++++++++++--- vllm/platforms/xpu.py | 5 ++++- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index d8503b20459f6..6e4b69c303254 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1151,7 +1151,7 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA): lora_logits = lora_logits.mT indices_padded = self.punica_wrapper.sampler_indices_padded - if current_platform.is_tpu(): + if current_platform.is_tpu() or current_platform.is_xpu(): indices_padded = indices_padded[:logits.size(0)] lora_logits = (lora_logits.reshape( diff --git a/vllm/lora/punica_wrapper/punica_xpu.py b/vllm/lora/punica_wrapper/punica_xpu.py index 572e39e0eced0..163bb412235ce 100644 --- a/vllm/lora/punica_wrapper/punica_xpu.py +++ b/vllm/lora/punica_wrapper/punica_xpu.py @@ -225,6 +225,13 @@ class PunicaWrapperXPU(PunicaWrapperBase): 
add_inputs=True, **kwargs) + @property + def sampler_indices_padded(self) -> torch.Tensor: + """ + This property provides access to padded sampler indices. + """ + return self._sampler_indices_padded[:] + def add_lora_logits(self, y: torch.Tensor, x: torch.Tensor, @@ -259,11 +266,11 @@ class PunicaWrapperXPU(PunicaWrapperBase): buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device) - - bgmv_shrink(x, lora_a_stacked, buffer, self.sampler_indices, scale) + sampler_indices = torch.narrow(self._sampler_indices, 0, 0, x.size(0)) + bgmv_shrink(x, lora_a_stacked, buffer, sampler_indices, scale) bgmv_expand(buffer, lora_b_stacked, y, - self.sampler_indices, + sampler_indices, add_inputs=True) return y.view_as(y_org) diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index d61b921e19cfe..645a9e63a4e5a 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -91,7 +91,7 @@ class XPUPlatform(Platform): cache_config.block_size = 64 # lazy import to avoid circular import - from vllm.config import CUDAGraphMode + from vllm.config import CompilationLevel, CUDAGraphMode compilation_config = vllm_config.compilation_config if compilation_config.cudagraph_mode is None or \ compilation_config.cudagraph_mode.max_cudagraph_mode() \ @@ -100,6 +100,9 @@ class XPUPlatform(Platform): "cudagraphs. 
Fallback to cudagraph_mode=NONE") compilation_config.cudagraph_mode = CUDAGraphMode.NONE + if vllm_config.lora_config is not None: + compilation_config.level = CompilationLevel.NO_COMPILATION + # check and update parallel config parallel_config = vllm_config.parallel_config parallel_config.worker_cls = "vllm.v1.worker.xpu_worker.XPUWorker" From 42dc59dbaceb2b9aa1477e9b3e0c33b379678468 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 3 Sep 2025 10:09:19 +0800 Subject: [PATCH 41/95] Update release pipeline post PyTorch 2.8.0 update (#24073) Signed-off-by: Huy Do Signed-off-by: youkaichao Co-authored-by: Huy Do --- .buildkite/release-pipeline.yaml | 32 ++++++++++++++--------------- .buildkite/scripts/upload-wheels.sh | 22 +++++++++++--------- tools/install_deepgemm.sh | 2 +- 3 files changed, 29 insertions(+), 27 deletions(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 92a1bcada3879..53b5b23db3c21 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -1,21 +1,24 @@ steps: - # aarch64 + CUDA builds - - label: "Build arm64 wheel - CUDA 12.8" - id: build-wheel-arm64-cuda-12-8 + # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9 + - label: "Build arm64 wheel - CUDA 12.9" + id: build-wheel-arm64-cuda-12-9 agents: queue: arm64_cpu_queue_postmerge commands: # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." 
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" - # x86 + CUDA builds + - block: "Build CUDA 12.8 wheel" + key: block-build-cu128-wheel + - label: "Build wheel - CUDA 12.8" + depends_on: block-build-cu128-wheel id: build-wheel-cuda-12-8 agents: queue: cpu_queue_postmerge @@ -44,18 +47,14 @@ steps: env: DOCKER_BUILDKIT: "1" - # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working. - # However, this block can be uncommented to save some compute hours. - # - block: "Build CUDA 11.8 wheel" - # key: block-build-cu118-wheel - - - label: "Build wheel - CUDA 11.8" - # depends_on: block-build-cu118-wheel - id: build-wheel-cuda-11-8 + # x86 + CUDA builds + - label: "Build wheel - CUDA 12.9" + depends_on: ~ + id: build-wheel-cuda-12-9 agents: queue: cpu_queue_postmerge commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." 
- "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-wheels.sh" @@ -75,6 +74,7 @@ steps: - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" + # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9 - label: "Build release image (arm64)" depends_on: ~ id: build-release-image-arm64 @@ -82,7 +82,7 @@ steps: queue: arm64_cpu_queue_postmerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." 
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" # Add job to create multi-arch manifest @@ -103,7 +103,7 @@ steps: - create-multi-arch-manifest - build-wheel-cuda-12-8 - build-wheel-cuda-12-6 - - build-wheel-cuda-11-8 + - build-wheel-cuda-12-9 id: annotate-release-workflow agents: queue: cpu_queue_postmerge diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index 745f285c008ad..43aa8c47be299 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -58,14 +58,15 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel" aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" -if [[ $normal_wheel == *"cu118"* ]]; then - # if $normal_wheel matches cu118, do not upload the index.html - echo "Skipping index files for cu118 wheels" -elif [[ $normal_wheel == *"cu126"* ]]; then +if [[ $normal_wheel == *"cu126"* ]]; then # if $normal_wheel matches cu126, do not upload the index.html echo "Skipping index files for cu126 wheels" +elif [[ $normal_wheel == *"cu128"* ]]; then + # if $normal_wheel matches cu128, do not upload the index.html + echo "Skipping index files for cu128 wheels" else - # only upload index.html for cu128 wheels (default wheels) + # only upload index.html for cu129 wheels (default wheels) as it + # is available on both x86 and arm64 aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" fi @@ -74,14 +75,15 @@ fi aws s3 cp "$wheel" "s3://vllm-wheels/nightly/" aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/" -if [[ $normal_wheel == *"cu118"* ]]; then - # if $normal_wheel matches cu118, do not upload the index.html - echo "Skipping index files for cu118 wheels" -elif [[ $normal_wheel == *"cu126"* ]]; then +if [[ $normal_wheel == *"cu126"* ]]; then # if 
$normal_wheel matches cu126, do not upload the index.html echo "Skipping index files for cu126 wheels" +elif [[ $normal_wheel == *"cu128"* ]]; then + # if $normal_wheel matches cu128, do not upload the index.html + echo "Skipping index files for cu128 wheels" else - # only upload index.html for cu128 wheels (default wheels) + # only upload index.html for cu129 wheels (default wheels) as it + # is available on both x86 and arm64 aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" fi diff --git a/tools/install_deepgemm.sh b/tools/install_deepgemm.sh index b125cda96f179..98427f1835ec2 100755 --- a/tools/install_deepgemm.sh +++ b/tools/install_deepgemm.sh @@ -105,4 +105,4 @@ fi popd -echo "✅ DeepGEMM installation completed successfully" \ No newline at end of file +echo "✅ DeepGEMM installation completed successfully" From e32a0e86781bd6dc0d5cf267cf177c762bf96ffa Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Tue, 2 Sep 2025 22:32:59 -0400 Subject: [PATCH 42/95] Upgrade xgrammar to 0.1.23 (#22988) Signed-off-by: Russell Bryant --- requirements/common.txt | 2 +- vllm/v1/worker/gpu_model_runner.py | 9 +-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/requirements/common.txt b/requirements/common.txt index e21abfb9a30bd..ce0795488cc1e 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -25,7 +25,7 @@ outlines == 0.1.11 ; platform_machine == "s390x" # required for outlines backend disk cache diskcache == 5.6.3 lark == 1.2.2 -xgrammar == 0.1.21; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" +xgrammar == 0.1.23; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" typing_extensions >= 4.10 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 partial-json-parser # used for parsing partial JSON outputs diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 
96dafd6add679..c81bc58f1ef46 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -90,15 +90,11 @@ from .utils import (AttentionGroup, MultiModalBudget, if TYPE_CHECKING: import xgrammar as xgr - import xgrammar.kernels.apply_token_bitmask_inplace_torch_compile as xgr_torch_compile # noqa: E501 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.v1.core.sched.output import SchedulerOutput else: xgr = LazyLoader("xgr", globals(), "xgrammar") - xgr_torch_compile = LazyLoader( - "xgr_torch_compile", globals(), - "xgrammar.kernels.apply_token_bitmask_inplace_torch_compile") logger = init_logger(__name__) @@ -1333,10 +1329,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # so we receive it in that format. grammar_bitmask = torch.from_numpy(grammar_bitmask).contiguous() - # Force use of the torch.compile implementation from xgrammar to work - # around issues with the Triton kernel in concurrent structured output - # scenarios. See PR #19565 and issues #19493, #18376 for details. 
- xgr_torch_compile.apply_token_bitmask_inplace_torch_compile( + xgr.apply_token_bitmask_inplace( logits, grammar_bitmask.to(self.device, non_blocking=True), indices=out_indices if not skip_out_indices else None, From 136d853e65a91a21b08227217b51daaba2d5cc71 Mon Sep 17 00:00:00 2001 From: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Date: Tue, 2 Sep 2025 22:52:51 -0400 Subject: [PATCH 43/95] [V1] Wrapper which plumbs request-level logits processors into vLLM batch-level logits processing (#23656) Signed-off-by: Andrew Feldman --- .../custom.py} | 0 .../logits_processor/custom_req.py | 151 ++++++++++++++++ .../logits_processor/custom_req_init.py | 165 ++++++++++++++++++ .../logits_processors/test_custom_offline.py | 33 ++++ tests/v1/logits_processors/utils.py | 67 ++++++- vllm/v1/sample/logits_processor/__init__.py | 113 +++++++++++- 6 files changed, 524 insertions(+), 5 deletions(-) rename examples/offline_inference/{logits_processor.py => logits_processor/custom.py} (100%) create mode 100644 examples/offline_inference/logits_processor/custom_req.py create mode 100644 examples/offline_inference/logits_processor/custom_req_init.py diff --git a/examples/offline_inference/logits_processor.py b/examples/offline_inference/logits_processor/custom.py similarity index 100% rename from examples/offline_inference/logits_processor.py rename to examples/offline_inference/logits_processor/custom.py diff --git a/examples/offline_inference/logits_processor/custom_req.py b/examples/offline_inference/logits_processor/custom_req.py new file mode 100644 index 0000000000000..4c19bb4ce2bae --- /dev/null +++ b/examples/offline_inference/logits_processor/custom_req.py @@ -0,0 +1,151 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""This example demonstrates wrapping a request-level logits processor to be +compatible with vLLM's batch-level logits processing + +For demo purposes, a dummy logits processor 
is employed which, if +`target_token` is passed as a keyword argument to `SamplingParams.extra_args`, +will mask out all tokens except `target_token`. This logits processor can be +applied to a vector of logits associated with a single decode step for a single +request. The logits processor cannot be applied to a request which does not +pass in a `target_token` custom argument. + +The request-level dummy logits processor is wrapped to create a batch-level +logits processor, which can apply the logits processor to output logits from +all requests in the persistent batch in a given decode step. For requests which +do not provide a `target_token` argument, the corresponding row of `logits` +will not be modified. + +A batch is constructed with `temperature=0.0` and 50% of requests specifying +`target_token`, and for these requests - and *only* these requests - we +expect the `target_token` to be decoded in each step, yielding an output +similar to that shown below: + +Generated Outputs: +------------------------------------------------------------ +Prompt: 'Hello, my name is' +Output: " ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '" +------------------------------------------------------------ +Prompt: 'The president of the United States is' +Output: " not a racist. 
He is a racist.\nHe's a racist because he" +------------------------------------------------------------ +Prompt: 'The capital of France is' +Output: ' also also also also also also also also also also also also also + also also also' +------------------------------------------------------------ +Prompt: 'The future of AI is' +Output: ' in the hands of the people.\n\nThe future of AI is in the' +------------------------------------------------------------ +""" + +from typing import Any, Optional + +import torch + +from vllm import LLM, SamplingParams +from vllm.logger import init_logger +from vllm.v1.sample.logits_processor import ( + AdapterLogitsProcessor, + RequestLogitsProcessor, +) + +logger = init_logger(__name__) + + +class DummyPerReqLogitsProcessor: + """The request-level logits processor masks out all logits except the + token id identified by `target_token`""" + + def __init__(self, target_token: int) -> None: + """Specify `target_token`""" + self.target_token = target_token + + def __call__( + self, + output_ids: list[int], + logits: torch.Tensor, + ) -> torch.Tensor: + val_to_keep = logits[self.target_token].item() + logits[:] = float("-inf") + logits[self.target_token] = val_to_keep + return logits + + +class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor): + """Example of wrapping a fake request-level logit processor to create a + batch-level logits processor""" + + def is_argmax_invariant(self) -> bool: + return False + + def new_req_logits_processor( + self, + params: SamplingParams, + ) -> Optional[RequestLogitsProcessor]: + """This method returns a new request-level logits processor, customized + to the `target_token` value associated with a particular request. + + Returns None if the logits processor should not be applied to the + particular request. To use the logits processor the request must have + a "target_token" custom argument with an integer value. 
+ + Args: + params: per-request sampling params + + Returns: + `Callable` request logits processor, or None + """ + target_token: Optional[Any] = params.extra_args and params.extra_args.get( + "target_token" + ) + if target_token is None: + return None + if not isinstance(target_token, int): + logger.warning( + "target_token value %s is not int; not applying logits" + " processor to request.", + target_token, + ) + return None + return DummyPerReqLogitsProcessor(target_token) + + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a mixture of requests which do and don't utilize the dummy logitproc +sampling_params_list = [ + SamplingParams(temperature=0.0, extra_args={"target_token": 128}), + SamplingParams(temperature=0.0), + SamplingParams(temperature=0.0, extra_args={"target_token": 67}), + SamplingParams(temperature=0.0), +] + + +def main(): + # Create an LLM. + llm = LLM( + model="facebook/opt-125m", + logits_processors=[WrappedPerReqLogitsProcessor], + ) + # Generate texts from the prompts. + # The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params_list) + # Print the outputs. 
+ print("\nGenerated Outputs:\n" + "-" * 60) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}") + print(f"Output: {generated_text!r}") + print("-" * 60) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference/logits_processor/custom_req_init.py b/examples/offline_inference/logits_processor/custom_req_init.py new file mode 100644 index 0000000000000..62947d122e01c --- /dev/null +++ b/examples/offline_inference/logits_processor/custom_req_init.py @@ -0,0 +1,165 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""This example demonstrates a special case of wrapping a request-level logits +processor, namely the case where it is necessary to utilize engine config or +environment info passed to the constructor. The subclass must override the +wrapper base class `__init__()` method to access the engine config, the device +identifier, or the flag which indicates whether pinned memory is available. + +For demo purposes, a request-level dummy logits processor is employed which +causes the same token (`target_token`) to be decoded in each step. The +request-level dummy logits processor is wrapped to create a batch-level logits +processor, which can apply the logits processor to output logits from all +requests in the persistent batch in a given decode step. + +The wrapped dummy logits processor below models a scenario where we must +disable the logits processor on non-"cuda" platforms. The wrapper base class +`__init__()` is overridden in order to check this condition and set a flag. 
+ +A batch is constructed with `temperature=0.0` and 50% of requests specifying +`target_token`, and for these requests - and *only* these requests - we +expect that on a "cuda" device the output will look something like: + +Generated Outputs: +------------------------------------------------------------ +Prompt: 'Hello, my name is' +Output: " ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '" +------------------------------------------------------------ +Prompt: 'The president of the United States is' +Output: " not a racist. He is a racist.\nHe's a racist because he" +------------------------------------------------------------ +Prompt: 'The capital of France is' +Output: ' also also also also also also also also also also also also also + also also also' +------------------------------------------------------------ +Prompt: 'The future of AI is' +Output: ' in the hands of the people.\n\nThe future of AI is in the' +------------------------------------------------------------ + +which indicates that the logits processor is running. However, on a non-"cuda" +device, the first and third requests would not repeat the same token. 
+""" + +from typing import Optional + +import torch + +from vllm import LLM, SamplingParams +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.v1.sample.logits_processor import ( + AdapterLogitsProcessor, + RequestLogitsProcessor, +) + +logger = init_logger(__name__) + + +class DummyPerReqLogitsProcessor: + """The request-level logits processor masks out all logits except the + token id identified by `target_token`""" + + def __init__(self, target_token: int) -> None: + """Specify `target_token`""" + self.target_token = target_token + + def __call__( + self, + output_ids: list[int], + logits: torch.Tensor, + ) -> torch.Tensor: + val_to_keep = logits[self.target_token].item() + logits[:] = float("-inf") + logits[self.target_token] = val_to_keep + return logits + + +class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor): + """Example of overriding the wrapper class `__init__()` in order to utilize + info about the device type""" + + def __init__( + self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool + ): + super().__init__(vllm_config, device, is_pin_memory) + self.is_cuda = device.type == "cuda" + + def is_argmax_invariant(self) -> bool: + return False + + def new_req_logits_processor( + self, + params: SamplingParams, + ) -> Optional[RequestLogitsProcessor]: + """This method returns a new request-level logits processor, customized + to the `target_token` value associated with a particular request. + + Returns None if the logits processor should not be applied to the + particular request. 
To use the logits processor the request must have + a "target_token" custom argument with an integer value, and the device + must be "cuda"-type + + Args: + params: per-request sampling params + + Returns: + `Callable` request logits processor, or None + """ + if ( + not self.is_cuda + or ( + target_token := params.extra_args + and params.extra_args.get("target_token") + ) + is None + ): + return None + if not isinstance(target_token, int): + logger.warning( + "target_token value %s is not int; not applying logits" + " processor to request.", + target_token, + ) + return None + return DummyPerReqLogitsProcessor(target_token) + + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a mixture of requests which do and don't utilize the dummy logitproc +sampling_params_list = [ + SamplingParams(temperature=0.0, extra_args={"target_token": 128}), + SamplingParams(temperature=0.0), + SamplingParams(temperature=0.0, extra_args={"target_token": 67}), + SamplingParams(temperature=0.0), +] + + +def main(): + # Create an LLM. + llm = LLM( + model="facebook/opt-125m", + logits_processors=[WrappedPerReqLogitsProcessor], + ) + # Generate texts from the prompts. + # The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params_list) + # Print the outputs. 
+ print("\nGenerated Outputs:\n" + "-" * 60) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}") + print(f"Output: {generated_text!r}") + print("-" * 60) + + +if __name__ == "__main__": + main() diff --git a/tests/v1/logits_processors/test_custom_offline.py b/tests/v1/logits_processors/test_custom_offline.py index a7fde1990f7ed..97d96b129ae90 100644 --- a/tests/v1/logits_processors/test_custom_offline.py +++ b/tests/v1/logits_processors/test_custom_offline.py @@ -15,6 +15,7 @@ from tests.v1.logits_processors.utils import (DUMMY_LOGITPROC_ARG, POOLING_MODEL_NAME, TEMP_GREEDY, CustomLogitprocSource, DummyLogitsProcessor, + WrappedPerReqLogitsProcessor, dummy_module) from tests.v1.logits_processors.utils import entry_points as fake_entry_points from tests.v1.logits_processors.utils import prompts @@ -161,6 +162,38 @@ def test_custom_logitsprocs(monkeypatch, _run_test(kwargs, logitproc_loaded=True) +@create_new_process_for_each_test() +def test_custom_logitsprocs_req(monkeypatch): + """Test passing request-level logits processor to offline Python interface + + Wrap a request-level logits processor to create a batch level logits + processor that has a well-defined behavior (mask out all tokens except one + `target_token`) + + Construct an `LLM` instance which loads the wrapped logits processor. Pass + the custom logitproc as a class object. + + Construct a reference `LLM` instance with no custom logitproc + + Pass in a batch of requests, 50% of which pass a `target_token` value + in through `SamplingParams.extra_args`, 50% of which do not. 
+ + Validate that + * Requests which do not activate the custom logitproc, yield the same + results for both `LLM` instances + * Requests which activate the custom logitproc, only output `target_token` + + Args: + monkeypatch: for setting env vars + """ + + # Test that logitproc info is passed to workers + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1") + random.seed(40) + _run_test({"logits_processors": [WrappedPerReqLogitsProcessor]}, + logitproc_loaded=True) + + @create_new_process_for_each_test() @pytest.mark.parametrize("logitproc_source", [ CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT, diff --git a/tests/v1/logits_processors/utils.py b/tests/v1/logits_processors/utils.py index c36f1bd021c70..7ec35bd3eb639 100644 --- a/tests/v1/logits_processors/utils.py +++ b/tests/v1/logits_processors/utils.py @@ -3,15 +3,21 @@ import types from enum import Enum, auto -from typing import Optional +from typing import Any, Optional import torch from vllm.config import VllmConfig -from vllm.v1.sample.logits_processor import (LOGITSPROCS_GROUP, BatchUpdate, - LogitsProcessor) +from vllm.logger import init_logger +from vllm.sampling_params import SamplingParams +from vllm.v1.sample.logits_processor import (LOGITSPROCS_GROUP, + AdapterLogitsProcessor, + BatchUpdate, LogitsProcessor, + RequestLogitsProcessor) from vllm.v1.sample.logits_processor.builtin import process_dict_updates +logger = init_logger(__name__) + MODEL_NAME = "facebook/opt-125m" POOLING_MODEL_NAME = "BAAI/bge-base-en-v1.5" DUMMY_LOGITPROC_ARG = "target_token" @@ -104,5 +110,60 @@ class EntryPoints(list): self.names = [ep.name for ep in eps] +class DummyPerReqLogitsProcessor: + """The request-level logits processor masks out all logits except the + token id identified by `target_token`""" + + def __init__(self, target_token: int) -> None: + """Specify `target_token`""" + self.target_token = target_token + + def __call__( + self, + output_ids: list[int], + logits: torch.Tensor, + ) -> torch.Tensor: + 
val_to_keep = logits[self.target_token].item() + logits[:] = float("-inf") + logits[self.target_token] = val_to_keep + return logits + + +class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor): + """Example of wrapping a fake request-level logit processor to create a + batch-level logits processor""" + + def is_argmax_invariant(self) -> bool: + return False + + def new_req_logits_processor( + self, + params: SamplingParams, + ) -> Optional[RequestLogitsProcessor]: + """This method returns a new request-level logits processor, customized + to the `target_token` value associated with a particular request. + + Returns None if the logits processor should not be applied to the + particular request. To use the logits processor the request must have + a "target_token" custom argument with an integer value. + + Args: + params: per-request sampling params + + Returns: + `Callable` request logits processor, or None + """ + target_token: Optional[ + Any] = params.extra_args and params.extra_args.get("target_token") + if target_token is None: + return None + if not isinstance(target_token, int): + logger.warning( + "target_token value %s is not int; not applying logits" + " processor to request.", target_token) + return None + return DummyPerReqLogitsProcessor(target_token) + + """Fake version of importlib.metadata.entry_points""" entry_points = lambda group: EntryPoints(group) diff --git a/vllm/v1/sample/logits_processor/__init__.py b/vllm/v1/sample/logits_processor/__init__.py index 8220269162951..a5f1cadd85241 100644 --- a/vllm/v1/sample/logits_processor/__init__.py +++ b/vllm/v1/sample/logits_processor/__init__.py @@ -1,16 +1,22 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib +import inspect import itertools +from abc import abstractmethod from collections.abc import Sequence +from functools import partial from typing import TYPE_CHECKING, Optional, Union import torch from vllm.logger import 
init_logger +from vllm.logits_process import LogitsProcessor as RequestLogitsProcessor +from vllm.sampling_params import SamplingParams from vllm.v1.sample.logits_processor.builtin import (LogitBiasLogitsProcessor, MinPLogitsProcessor, - MinTokensLogitsProcessor) + MinTokensLogitsProcessor, + process_dict_updates) from vllm.v1.sample.logits_processor.interface import (BatchUpdate, LogitsProcessor, MoveDirectionality) @@ -177,9 +183,112 @@ def build_logitsprocs( BUILTIN_LOGITS_PROCESSORS, custom_logitsprocs_classes)) +class AdapterLogitsProcessor(LogitsProcessor): + """Wrapper for per-request logits processors + + To wrap a specific per-request logits processor, + * Subclass `AdapterLogitsProcessor` + * Implement `self.is_argmax_invariant()` base-class method + * Implement `self.new_req_logits_processor(params)` + + `self.__init__(vllm_config, device, is_pin_memory)` does not need to be + overridden in general. However, to implement custom constructor behavior - + especially any logic which operates on or stores `vllm_config`, `device`, + or `is_pin_memory` - `self.__init__(vllm_config, device, is_pin_memory)` + must be overriden and the override must call + `super().__init__(vllm_config, device, is_pin_memory)` + """ + + def __init__(self, vllm_config: "VllmConfig", device: torch.device, + is_pin_memory: bool): + """Subclass must invoke + `super().__init__(vllm_config, device, is_pin_memory)`. + + Subclass constructor may find it useful to utilize the `vllm_config`, + `device` and `is_pin_memory` argument. However regardless of whether + these arguments are used, the vLLM logits processor interface requires + all three arguments to be present. 
+ """ + + # Map req index -> logits processor state + # + # State representation is a partial[Tensor] comprising a request-level + # logits processor with the output token ids argument and (if required) + # the prompt token ids argument pre-populated + # + # Note that the partial carries a *reference* to output token ids, and + # will thus always operate on the list as it is currently, not as it + # was when the partial was created. + self.req_info: dict[int, partial[torch.Tensor]] = {} + + @abstractmethod + def new_req_logits_processor( + self, + params: SamplingParams, + ) -> Optional[RequestLogitsProcessor]: + """Consume request info; return a per-request logits processor. + + Return None if logits processor does not need to be applied to request + + Args: + params: request sampling params + + Returns: + None if logits processor should not be applied to request; otherwise + returns a `RequestLogitsProcessor` instance + + """ + raise NotImplementedError + + def _new_state( + self, + params: SamplingParams, + prompt_ids: list[int], + output_ids: list[int], + ) -> Optional[partial[torch.Tensor]]: + """Return state representation for new request + + Returns None if logits processor is not applicable to request + + Args: + params: request sampling params + prompt_ids: request prompt token ids + output_ids: decoded tokens so far for this request + + Returns: + logits processor partial[Tensor] or None + + """ + if req_lp := self.new_req_logits_processor(params): + args = [prompt_ids, output_ids] if (len( + inspect.signature(req_lp).parameters) == 3) else [output_ids] + return partial(req_lp, *args) + return None + + def update_state(self, batch_update: Optional[BatchUpdate]): + process_dict_updates( + self.req_info, + batch_update, + self._new_state, + ) + + def apply(self, logits: torch.Tensor) -> torch.Tensor: + if self.req_info: + # Apply per-request logits processors to corresponding rows of + # logits tensor + for req_idx, req_lp in self.req_info.items(): + 
req_logits = logits[req_idx] + new_logits = req_lp(req_logits) + if new_logits is not req_logits: + # Modify logits tensor row in-place if necessary + logits[req_idx] = new_logits + return logits + + __all__ = [ "LogitsProcessor", "LogitBiasLogitsProcessor", "MinPLogitsProcessor", "MinTokensLogitsProcessor", "BatchUpdate", "BatchUpdateBuilder", "MoveDirectionality", "LogitsProcessors", "build_logitsprocs", - "STR_POOLING_REJECTS_LOGITSPROCS", "LOGITSPROCS_GROUP" + "STR_POOLING_REJECTS_LOGITSPROCS", "LOGITSPROCS_GROUP", + "AdapterLogitsProcessor" ] From 1bd007f23476d98caeb0a62c00384d7f2cf052a6 Mon Sep 17 00:00:00 2001 From: co63oc Date: Wed, 3 Sep 2025 11:44:50 +0800 Subject: [PATCH 44/95] fix some typos (#24071) Signed-off-by: co63oc --- benchmarks/benchmark_block_pool.py | 2 +- benchmarks/benchmark_ngram_proposer.py | 2 +- csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu | 2 +- docs/configuration/optimization.md | 4 ++-- docs/design/io_processor_plugins.md | 2 +- .../prithvi_geospatial_mae_io_processor.py | 2 +- examples/online_serving/prithvi_geospatial_mae.py | 2 +- tests/compile/piecewise/test_multiple_graphs.py | 2 +- tests/kernels/moe/test_mxfp4_moe.py | 2 +- tests/models/multimodal/processing/test_mllama4.py | 2 +- tests/quantization/test_modelopt.py | 2 +- tests/samplers/test_beam_search.py | 2 +- tests/v1/attention/test_chunked_local_attention.py | 2 +- .../unit/test_shared_storage_connector.py | 14 +++++++------- tests/v1/logits_processors/test_custom_offline.py | 2 +- vllm/benchmarks/serve.py | 2 +- vllm/config/compilation.py | 2 +- vllm/config/parallel.py | 2 +- .../kv_transfer/kv_connector/v1/nixl_connector.py | 2 +- vllm/entrypoints/openai/serving_responses.py | 2 +- vllm/model_executor/layers/activation.py | 2 +- .../compressed_tensors/transform/module.py | 2 +- vllm/model_executor/layers/quantization/mxfp4.py | 2 +- vllm/model_executor/models/gemma3n_mm.py | 2 +- vllm/model_executor/models/interns1.py | 2 +- vllm/third_party/pynvml.py | 2 +- 
vllm/v1/attention/backends/flash_attn.py | 2 +- vllm/v1/attention/backends/flashinfer.py | 2 +- vllm/v1/core/kv_cache_utils.py | 2 +- vllm/v1/worker/gpu_input_batch.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 2 +- vllm/v1/worker/kv_connector_model_runner_mixin.py | 2 +- 32 files changed, 39 insertions(+), 39 deletions(-) diff --git a/benchmarks/benchmark_block_pool.py b/benchmarks/benchmark_block_pool.py index fd363c2ad0514..eae8d9927ea39 100644 --- a/benchmarks/benchmark_block_pool.py +++ b/benchmarks/benchmark_block_pool.py @@ -57,7 +57,7 @@ def invoke_main() -> None: "--num-iteration", type=int, default=1000, - help="Number of iterations to run to stablize final data readings", + help="Number of iterations to run to stabilize final data readings", ) parser.add_argument( "--allocate-blocks", diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py index c60040d05ab7a..11833fa1b3c8b 100644 --- a/benchmarks/benchmark_ngram_proposer.py +++ b/benchmarks/benchmark_ngram_proposer.py @@ -77,7 +77,7 @@ def invoke_main() -> None: "--num-iteration", type=int, default=100, - help="Number of iterations to run to stablize final data readings", + help="Number of iterations to run to stabilize final data readings", ) parser.add_argument( "--num-req", type=int, default=128, help="Number of requests in the batch" diff --git a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu index fdac47c425d61..d7efb717a9a76 100644 --- a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu +++ b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu @@ -181,7 +181,7 @@ struct W4A8GemmKernel { auto A_ptr = static_cast(A.const_data_ptr()); auto B_ptr = static_cast(B.const_data_ptr()); auto D_ptr = static_cast(D.data_ptr()); - // can we avoid harcode the 8 here + // can we avoid hardcode the 8 here auto S_ptr = static_cast const*>( group_scales.const_data_ptr()); diff --git a/docs/configuration/optimization.md 
b/docs/configuration/optimization.md index 0ab2ae58ad861..c853fcf92941e 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -210,7 +210,7 @@ vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4 -dp 2 !!! note API server scale-out disables [multi-modal IPC caching](#ipc-caching) - because it requires a one-to-one correspondance between API and engine core processes. + because it requires a one-to-one correspondence between API and engine core processes. This does not impact [multi-modal processor caching](#processor-caching). @@ -227,7 +227,7 @@ to avoid repeatedly processing the same multi-modal inputs in `BaseMultiModalPro ### IPC Caching Multi-modal IPC caching is automatically enabled when -there is a one-to-one correspondance between API (`P0`) and engine core (`P1`) processes, +there is a one-to-one correspondence between API (`P0`) and engine core (`P1`) processes, to avoid repeatedly transferring the same multi-modal inputs between them. ### Configuration diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md index ee474b5a7b997..e70ee4a076e54 100644 --- a/docs/design/io_processor_plugins.md +++ b/docs/design/io_processor_plugins.md @@ -2,7 +2,7 @@ IO Processor plugins are a feature that allows pre and post processing of the model input and output for pooling models. The idea is that users are allowed to pass a custom input to vLLM that is converted into one or more model prompts and fed to the model `encode` method. One potential use-case of such plugins is that of using vLLM for generating multi-modal data. Say users feed an image to vLLM and get an image in output. -When performing an inference with IO Processor plugins, the prompt type is defined by the plugin and the same is valid for the final request output. vLLM does not perform any validation of input/output data, and it is up to the plugin to ensure the correct data is being fed to the model and returned to the user. 
As of now these plugins support only pooling models and can be triggerd via the `encode` method in `LLM` and `AsyncLLM`, or in online serving mode via the `/pooling` endpoint. +When performing an inference with IO Processor plugins, the prompt type is defined by the plugin and the same is valid for the final request output. vLLM does not perform any validation of input/output data, and it is up to the plugin to ensure the correct data is being fed to the model and returned to the user. As of now these plugins support only pooling models and can be triggered via the `encode` method in `LLM` and `AsyncLLM`, or in online serving mode via the `/pooling` endpoint. ## Writing an IO Processor Plugin diff --git a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py b/examples/offline_inference/prithvi_geospatial_mae_io_processor.py index 8023cd6677762..adc27859a1cdd 100644 --- a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py +++ b/examples/offline_inference/prithvi_geospatial_mae_io_processor.py @@ -12,7 +12,7 @@ from vllm.pooling_params import PoolingParams # multimodal data. In this specific case this example will take a geotiff # image as input, process it using the multimodal data processor, and # perform inference. -# Reuirement - install plugin at: +# Requirement - install plugin at: # https://github.com/christian-pinto/prithvi_io_processor_plugin diff --git a/examples/online_serving/prithvi_geospatial_mae.py b/examples/online_serving/prithvi_geospatial_mae.py index 31301e0042cf4..359162c470f08 100644 --- a/examples/online_serving/prithvi_geospatial_mae.py +++ b/examples/online_serving/prithvi_geospatial_mae.py @@ -10,7 +10,7 @@ import requests # multimodal data. In this specific case this example will take a geotiff # image as input, process it using the multimodal data processor, and # perform inference. 
-# Reuirements : +# Requirements : # - install plugin at: # https://github.com/christian-pinto/prithvi_io_processor_plugin # - start vllm in serving mode with the below args diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/piecewise/test_multiple_graphs.py index f5e2d9ddb7528..aee2acbd490ee 100644 --- a/tests/compile/piecewise/test_multiple_graphs.py +++ b/tests/compile/piecewise/test_multiple_graphs.py @@ -134,7 +134,7 @@ class SimpleModelWithTwoGraphs(ParentModel): # Test will fail without set_model_tag here with error: # "ValueError: too many values to unpack (expected 3)" # This is because CompiledAttention and CompiledAttentionTwo - # have different implmentations but the same torch.compile + # have different implementations but the same torch.compile # cache dir will be used as default prefix is 'model_tag' with set_model_tag("attn_one"): self.attn_one = CompiledAttention( diff --git a/tests/kernels/moe/test_mxfp4_moe.py b/tests/kernels/moe/test_mxfp4_moe.py index 7bd1ffce58e96..c29bed3dd6b32 100644 --- a/tests/kernels/moe/test_mxfp4_moe.py +++ b/tests/kernels/moe/test_mxfp4_moe.py @@ -224,7 +224,7 @@ def tg_mxfp4_moe( assert (w2_bias.dim() == 2 and w2_bias.shape[0] == num_experts and w2_bias.shape[1] == hidden_size) - # Swap w1 and w3 as the defenition of + # Swap w1 and w3 as the definition of # swiglu is different in the trtllm-gen w13_weight_scale_ = w13_weight_scale.clone() w13_weight_ = w13_weight.clone() diff --git a/tests/models/multimodal/processing/test_mllama4.py b/tests/models/multimodal/processing/test_mllama4.py index 3be77b5da63f2..e7b28ff8ec7f0 100644 --- a/tests/models/multimodal/processing/test_mllama4.py +++ b/tests/models/multimodal/processing/test_mllama4.py @@ -52,7 +52,7 @@ def test_profiling(model_id: str, max_model_len: int): chunks_per_image = prod(mm_data["patches_per_image"]) total_num_patches = chunks_per_image * tokens_per_patch num_tiles = mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][ 
- 1] # x-y seperator tokens + 1] # x-y separator tokens total_tokens = total_num_patches.item() + num_tiles.item( ) + 3 # image start, image, image end diff --git a/tests/quantization/test_modelopt.py b/tests/quantization/test_modelopt.py index fcbfa681d75c9..c60a03f44baec 100644 --- a/tests/quantization/test_modelopt.py +++ b/tests/quantization/test_modelopt.py @@ -27,7 +27,7 @@ def use_v0_only(monkeypatch): reason="ModelOpt FP8 is not supported on this GPU type.") def test_modelopt_fp8_checkpoint_setup(vllm_runner): """Test ModelOpt FP8 checkpoint loading and structure validation.""" - # TODO: provide a small publically available test checkpoint + # TODO: provide a small publicly available test checkpoint model_path = ("/home/scratch.omniml_data_1/zhiyu/ckpts/test_ckpts/" "TinyLlama-1.1B-Chat-v1.0-fp8-0710") diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index cc9a88a255f9f..0320a5ef31a65 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -82,7 +82,7 @@ def test_beam_search_with_concurrency_limit( beam_width: int, ) -> None: # example_prompts[1]&[3]&[7] fails due to unknown reason even without - # concurency limit. skip them for now. + # concurrency limit. skip them for now. example_prompts = (example_prompts[:8]) concurrency_limit = 2 assert len(example_prompts) > concurrency_limit diff --git a/tests/v1/attention/test_chunked_local_attention.py b/tests/v1/attention/test_chunked_local_attention.py index 8c5a63653db9f..be77256a0d2f0 100644 --- a/tests/v1/attention/test_chunked_local_attention.py +++ b/tests/v1/attention/test_chunked_local_attention.py @@ -160,7 +160,7 @@ def test_local_attention_virtual_batches(test_data: LocalAttentionTestData): # Use torch.arange instead of torch.randint so we can assert on # block table tensor values. 
The block table will have shape # (num_batches, cdiv(max_seq_len, block_size)) and the values will be - # aranged from 0 to cdiv(max_seq_len, block_size)-1 + # arranged from 0 to cdiv(max_seq_len, block_size)-1 arange_block_indices=True, ) diff --git a/tests/v1/kv_connector/unit/test_shared_storage_connector.py b/tests/v1/kv_connector/unit/test_shared_storage_connector.py index db203b81f15fc..6be261e45cb00 100644 --- a/tests/v1/kv_connector/unit/test_shared_storage_connector.py +++ b/tests/v1/kv_connector/unit/test_shared_storage_connector.py @@ -33,7 +33,7 @@ def _check_path_len(path): def _list_path(path): - """Return the list of foldername (hashes generatd) under the path""" + """Return the list of foldername (hashes generated) under the path""" return list(path.iterdir()) @@ -41,7 +41,7 @@ def run_test(tmp_path, processor, llm: LLM, question: str, image_urls: list[Image], expected_len: int, info: str): """ One individual test to process the prompt and output base on 1 set of input - Then check if the length in the strorage path matches the expected length + Then check if the length in the storage path matches the expected length `info` introduces details or purpose of the individual test """ print(f"***info: {info}***") @@ -115,7 +115,7 @@ def test_shared_storage_connector_hashes(tmp_path): """ Tests that SharedStorageConnector saves KV to the storage locations with proper hashes; that are unique for inputs with identical text but - differnt images (same size), or same multiple images but different orders. + different images (same size), or same multiple images but different orders. """ # Using tmp_path as the storage path to store KV print(f"KV storage path at: {str(tmp_path)}") @@ -171,12 +171,12 @@ def test_shared_storage_connector_hashes(tmp_path): img=[image_1], expected_len=2, info=("image_1 single input the 2nd time. 
" - "It should not form aother new hash.")), + "It should not form another new hash.")), InputCase(text=TEXT_PROMPTS[0], img=[image_2], expected_len=2, info=("image_2 single input the 2nd time. " - "It should not form aother new hash.")), + "It should not form another new hash.")), InputCase(text=TEXT_PROMPTS[0], img=[image_1, image_2], expected_len=3, @@ -189,12 +189,12 @@ def test_shared_storage_connector_hashes(tmp_path): img=[image_1, image_2], expected_len=4, info=("[image_1, image_2] input the 2nd time. " - "It should not form aother new hash.")), + "It should not form another new hash.")), InputCase(text=TEXT_PROMPTS[0], img=[image_2, image_1], expected_len=4, info=("[image_2, image_1] input the 2nd time. " - "It should not form aother new hash.")), + "It should not form another new hash.")), InputCase(text=TEXT_PROMPTS[0], img=[], expected_len=5, diff --git a/tests/v1/logits_processors/test_custom_offline.py b/tests/v1/logits_processors/test_custom_offline.py index 97d96b129ae90..891f55a14633b 100644 --- a/tests/v1/logits_processors/test_custom_offline.py +++ b/tests/v1/logits_processors/test_custom_offline.py @@ -81,7 +81,7 @@ def _run_test(kwargs: dict, logitproc_loaded: bool) -> None: target_token = params.extra_args[DUMMY_LOGITPROC_ARG] if not all(x == target_token for x in lp_toks): raise AssertionError( - f"Request {bdx} generated {lp_toks}, shoud all be " + f"Request {bdx} generated {lp_toks}, should all be " f"{target_token}") else: # This request does not exercise custom logitproc (or custom diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index abb838316cd31..a98eb2a78f103 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -189,7 +189,7 @@ async def get_request( # NOTE: If we simply accumulate the random delta values # from the gamma distribution, their sum would have 1-2% gap # from target_total_delay_s. 
The purpose of the following logic is to - # close the gap for stablizing the throughput data + # close the gap for stabilizing the throughput data # from different random seeds. target_total_delay_s = total_requests / request_rate normalize_factor = target_total_delay_s / delay_ts[-1] diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 5c3b220016360..28ad3d2f535d3 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -234,7 +234,7 @@ class CompilationConfig: - FULL_AND_PIECEWISE. PIECEWISE mode build piecewise cudagraph only, keeping the cudagraph - incompatiable ops (i.e. some attention ops) outside the cudagraph + incompatible ops (i.e. some attention ops) outside the cudagraph for general flexibility. This is the default mode. diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 9ea883d4a03cd..9d4594bab3c17 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -87,7 +87,7 @@ class ParallelConfig: data_parallel_external_lb: bool = False """Whether to use "external" DP LB mode. Applies only to online serving and when data_parallel_size > 0. This is useful for a "one-pod-per-rank" - wide-EP setup in Kuberentes. Set implicitly when --data-parallel-rank + wide-EP setup in Kubernetes. Set implicitly when --data-parallel-rank is provided explicitly to vllm serve.""" data_parallel_hybrid_lb: bool = False """Whether to use "hybrid" DP LB mode. 
Applies only to online serving diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 6608d2a4a9e09..efe023d5595e5 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -787,7 +787,7 @@ class NixlConnectorWorker: self.src_xfer_side_handle = self.nixl_wrapper.prep_xfer_dlist( "NIXL_INIT_AGENT", descs) - # TODO(mgoin): Hybrid memory allocator is currently diabled for + # TODO(mgoin): Hybrid memory allocator is currently disabled for # models with local attention (Llama 4). Can remove this once enabled. if self.vllm_config.model_config.hf_config.model_type == "llama4": from transformers import Llama4TextConfig diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 4c15de3030998..7f11b37e51728 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -717,7 +717,7 @@ class OpenAIServingResponses(OpenAIServing): prev_msgs.append(msg) messages.extend(prev_msgs) # Append the new input. - # Reponses API supports simple text inputs without chat format. + # Responses API supports simple text inputs without chat format. 
if isinstance(request.input, str): messages.append(get_user_message(request.input)) else: diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index eb7e494e32861..fac37ef75b638 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -362,7 +362,7 @@ class ReLUSquaredActivation(CustomOp): return torch.square(F.relu(x)) def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: - #TODO : implement cuda kenrels + #TODO : implement cuda kernels return self.forward_native(x) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py index b3be254717734..48ab2582a3b26 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py @@ -83,7 +83,7 @@ class HadamardTransform(torch.nn.Module): # do not fold into weight in order to utilize FWHT self.scales[part_id] = 1 / math.sqrt(data.size(0)) - # FUTURE: avoid runtime tranpose by processing weights + # FUTURE: avoid runtime transpose by processing weights # prior to apply def forward(self, value: Tensor, part_id: int = 0) -> Tensor: diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index a2301779c77e4..85d05ff51daa1 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -310,7 +310,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): w13_bias = layer.w13_bias.data.to(torch.float32) w2_bias = layer.w2_bias.data.to(torch.float32) - # Swap w1 and w3 as the defenition of + # Swap w1 and w3 as the definition of # swiglu is different in the trtllm-gen def swap_every_two_rows(x, axis=-1): shape = x.shape diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py 
index d831e9084db57..3074451e40a4d 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -179,7 +179,7 @@ class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo] ) -> BatchFeature: # HF Transformers audio processor no longer accepts `audios` key. - # We pop `audios` and replace it with `audio` key to surpress + # We pop `audios` and replace it with `audio` key to suppress # the warning. if 'audios' in mm_data: mm_data['audio'] = mm_data.pop('audios') diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index 26e358f9394c6..d998b8a0ab4f7 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -492,7 +492,7 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal, @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: - # transformers InternVLProcessor uses as the seperator + # transformers InternVLProcessor uses as the separator # refer to https://github.com/huggingface/transformers/blob/f90de364c2484c7c325bbe05befdcf487bd75b63/src/transformers/models/internvl/processing_internvl.py#L116 if modality.startswith("image"): return '' diff --git a/vllm/third_party/pynvml.py b/vllm/third_party/pynvml.py index c06aa567444d8..6aabbc217dd03 100644 --- a/vllm/third_party/pynvml.py +++ b/vllm/third_party/pynvml.py @@ -3533,7 +3533,7 @@ def nvmlDeviceGetMPSComputeRunningProcesses_v3(handle): return [] elif (ret == NVML_ERROR_INSUFFICIENT_SIZE): # typical case - # oversize the array incase more processes are created + # oversize the array in case more processes are created c_count.value = c_count.value * 2 + 5 proc_array = c_nvmlProcessInfo_v3_t * c_count.value c_procs = proc_array() diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index dd2b956d4fa3d..3cc67acd04c6b 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ 
b/vllm/v1/attention/backends/flash_attn.py @@ -167,7 +167,7 @@ class FlashAttentionMetadataBuilder( # work for mixed prefill-decode and uniform-decode. But for non-spec decodes # the graphs would not work for mixed prefill-decode; sorta the inverse # of UNIFORM_SINGLE_TOKEN_DECODE. - # Theres probably a better way to describe this using `AttentionCGSupport` + # There's probably a better way to describe this using `AttentionCGSupport` # but for now just set it to `UNIFORM_BATCH` to get use to drop down # to FULL_AND_PIECEWISE. # TODO(luka, lucas): audit FA2 as part of: diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 5fc3a1517b690..2f275b8b23b17 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -291,7 +291,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): paged_kv_indices_buffer=paged_kv_indices, paged_kv_last_page_len_buffer=paged_kv_last_page_len, # Tensor cores are enabled by default because the perf would be - # atleast as good as cuda cores for all attention ops in latest + # at least as good as cuda cores for all attention ops in latest # gpus. use_tensor_cores=True, ) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 590baa6208d07..248ad9cda7c28 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -217,7 +217,7 @@ class FreeKVCacheBlockQueue: # Create a fake head and a tail block for the doubly linked list to # reduce branching in the code # - # The implementation garenteed that the fake head and tail + # The implementation guaranteed that the fake head and tail # are NEVER got popped, so we could safely assume each real blocks # in the queue has prev and next blocks. 
self.fake_free_list_head = KVCacheBlock(block_id=-1) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index ef5a7e39a5b16..ad70d9efaaaac 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -584,7 +584,7 @@ class InputBatch: if self.is_pooling_model: last_req_index -= 1 - # Samping state not used by pooling models. + # Sampling state not used by pooling models. continue # Autoregressive models require detailed tracking of condense diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c81bc58f1ef46..4556a51b809d8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2776,7 +2776,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.attn_groups.append( create_attn_groups(attn_backends, kv_cache_spec)) - # Calculate reorder batch threshold (if neeeded) + # Calculate reorder batch threshold (if needed) self.calculate_reorder_batch_threshold() def initialize_cudagraph_capture(self) -> None: diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py index a03ebe35d8e0a..e2ffa2f12fda5 100644 --- a/vllm/v1/worker/kv_connector_model_runner_mixin.py +++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py @@ -82,7 +82,7 @@ class KVConnectorModelRunnerMixin: scheduler_output) if has_kv_transfer_group() else nullcontext() # This context manager must be used within an active forward context. 
- # It encapsulates the entire KV conector lifecycle within execute_model + # It encapsulates the entire KV connector lifecycle within execute_model @staticmethod @contextmanager def _get_kv_connector_output( From c4ed78b14f7f63cfa65722ad21deffe964441fd2 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 2 Sep 2025 23:45:52 -0400 Subject: [PATCH 45/95] [Compile] Fix Compile Warning for `w4a8_mm_entry.cu` (#23660) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: yewentao256 Co-authored-by: Luka Govedič --- csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu index d7efb717a9a76..57bcbaae45dda 100644 --- a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu +++ b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu @@ -11,6 +11,7 @@ #include "core/registration.h" #include "cutlass/cutlass.h" +#include #include "cute/tensor.hpp" #include "cutlass/gemm/collective/collective_builder.hpp" @@ -169,6 +170,11 @@ struct W4A8GemmKernel { int k = A.size(1); int n = B.size(1); + // safely cast group_size to int + TORCH_CHECK(group_size > 0 && group_size <= std::numeric_limits::max(), + "group_size out of supported range for int: ", group_size); + int const group_size_int = static_cast(group_size); + // Allocate output const at::cuda::OptionalCUDAGuard device_guard(device_of(A)); auto device = A.device(); @@ -192,7 +198,7 @@ struct W4A8GemmKernel { cute::tile_to_shape(LayoutAtomQuant{}, shape_B); // strides - int const scale_k = cutlass::ceil_div(k, group_size); + int const scale_k = cutlass::ceil_div(k, group_size_int); StrideA stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1)); // Reverse stride here due to swap and transpose @@ -211,8 +217,8 @@ struct W4A8GemmKernel { using 
EpilogueArguments = typename GemmKernelShuffled::EpilogueArguments; MainloopArguments mainloop_arguments{ - B_ptr, layout_B_reordered, A_ptr, stride_A, - S_ptr, stride_S, group_size}; + B_ptr, layout_B_reordered, A_ptr, stride_A, + S_ptr, stride_S, group_size_int}; EpilogueArguments epilogue_arguments{ ChTokScalesEpilogue::prepare_args(channel_scales, token_scales), From d7e1e599724ea82e12f40bd2b9320b5c27b23a32 Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Wed, 3 Sep 2025 06:05:45 +0200 Subject: [PATCH 46/95] [Doc]: fix typos in Python comments (#24093) Signed-off-by: Didier Durand --- tests/core/test_scheduler.py | 2 +- .../correctness/test_transcription_api_correctness.py | 2 +- tests/entrypoints/openai/test_return_token_ids.py | 2 +- tests/entrypoints/openai/test_serving_chat.py | 2 +- tests/kernels/utils.py | 2 +- tests/multimodal/test_utils.py | 4 ++-- tests/v1/e2e/test_spec_decode.py | 2 +- .../kv_connector/unit/test_remote_decode_lifecycle.py | 4 ++-- tests/v1/spec_decode/test_tree_attention.py | 4 ++-- vllm/lora/utils.py | 2 +- .../layers/quantization/compressed_tensors/utils.py | 2 +- vllm/multimodal/utils.py | 10 +++++----- vllm/v1/attention/backends/utils.py | 2 +- vllm/v1/structured_output/utils.py | 4 ++-- vllm/v1/worker/tpu_worker.py | 2 +- 15 files changed, 23 insertions(+), 23 deletions(-) diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 591e1780c11c6..e1a840bb15039 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -641,7 +641,7 @@ def test_schedule_decode_blocks_to_copy_update(): # Nothing is preempted. assert output.blocks_to_swap_out == [] # Since append_slot returns the source -> dist mapping, it should - # applied. + # be applied. 
assert output.blocks_to_copy == [(2, 3)] diff --git a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py index 0d0ce0be8c5f8..9122b7003bf9a 100644 --- a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py +++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py @@ -32,7 +32,7 @@ def to_bytes(y, sr): async def transcribe_audio(client, tokenizer, y, sr): # Send loaded audio directly instead of loading from disk, - # dont account for that time though + # don't account for that time though with to_bytes(y, sr) as f: start_time = time.perf_counter() transcription = await client.audio.transcriptions.create( diff --git a/tests/entrypoints/openai/test_return_token_ids.py b/tests/entrypoints/openai/test_return_token_ids.py index 6addcb41c4098..ff8f193fec552 100644 --- a/tests/entrypoints/openai/test_return_token_ids.py +++ b/tests/entrypoints/openai/test_return_token_ids.py @@ -224,7 +224,7 @@ async def test_comparison_with_prompt_logprobs_and_logprobs(server): logprobs_token_ids.append(token_id) # When echo=True, the logprobs include both prompt and response tokens - # The token_ids field should match the the suffix of response portion + # The token_ids field should match the suffix of response portion # The prompt_token_ids should match the prompt portion assert len(completion.choices[0].token_ids) < len(logprobs_token_ids) response_token_ids_length = len(completion.choices[0].token_ids) diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 10879f0be83c8..fe482112d386b 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -313,7 +313,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type): }], ) - # By default cache_salt in the engine prompt is not set + # By default, cache_salt in the engine 
prompt is not set with suppress(Exception): await serving_chat.create_chat_completion(req) assert "cache_salt" not in mock_engine.generate.call_args.args[0] diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index fa4125840a010..c46db8e307936 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -1236,7 +1236,7 @@ def baseline_scaled_mm(a: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: # We treat N-dimensional group scaling as extended numpy-style broadcasting - # in numpy simply stretches dimensions with an extent of 1 to match the + # in numpy simply stretches dimensions with an extent of 1 to match # the target shape by repeating the data along that dimension (broadcasting) # , we extend these semantics to say if the extent of a dimension in the # source shape is not 1 and does not match the target shape we repeat each diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 05e68a961a548..0f82e1f3e343e 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -458,7 +458,7 @@ def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int, with torch.inference_mode(): sharded_output = run_dp_sharded_vision_model(image_input, vision_model) - # Check that the world size is setup correctly + # Check that the world size is set up correctly assert get_tensor_model_parallel_world_size() == world_size # Check that the outputs have the same shape @@ -642,7 +642,7 @@ def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int, rope_type="rope_3d") sharded_output = torch.cat(sharded_output, dim=0) - # Check that the world size is setup correctly + # Check that the world size is set up correctly assert get_tensor_model_parallel_world_size() == world_size # Compare outputs (only on rank 0) diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index bd0fa6b80781a..cd1d34fc6c3ec 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ 
b/tests/v1/e2e/test_spec_decode.py @@ -83,7 +83,7 @@ def test_ngram_correctness( model_name: str, ): ''' - Compare the outputs of a original LLM and a speculative LLM + Compare the outputs of an original LLM and a speculative LLM should be the same when using ngram speculative decoding. ''' with monkeypatch.context() as m: diff --git a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py index d8c56ac42f718..380e72a156336 100644 --- a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py +++ b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py @@ -42,7 +42,7 @@ def test_basic_lifecycle(): engine_core_outputs = scheduler.update_from_output(scheduler_output, model_runner_output) - # Ensure the request is finished after 1 tokens. + # Ensure the request is finished after 1 token. assert request.is_finished() assert request.status == RequestStatus.FINISHED_LENGTH_CAPPED output = engine_core_outputs[0].outputs[0] @@ -141,7 +141,7 @@ def test_short_prompt_lifecycle(): def test_prefix_cache_lifecycle(): - """Test that remote decode params still works with a prefix cache hit.""" + """Test that remote decode params still work with a prefix cache hit.""" vllm_config = create_vllm_config() scheduler = create_scheduler(vllm_config) diff --git a/tests/v1/spec_decode/test_tree_attention.py b/tests/v1/spec_decode/test_tree_attention.py index 6317817408661..eacb2ad584baf 100644 --- a/tests/v1/spec_decode/test_tree_attention.py +++ b/tests/v1/spec_decode/test_tree_attention.py @@ -187,7 +187,7 @@ def test_tree_attn_correctness() -> None: dtype=torch.bfloat16, ) - # Setup the block table and KV cache for paged KV. + # Set up the block table and KV cache for paged KV. 
assert max_sequence_length % block_size == 0 max_blocks_per_batch = max_sequence_length // block_size kv_cache = torch.randn( @@ -222,7 +222,7 @@ def test_tree_attn_correctness() -> None: num_alloc_blocks_per_batch] = block_ids.view( -1, num_alloc_blocks_per_batch) - # Setup the slot mapping for the input KVs. + # Set up the slot mapping for the input KVs. tree_positions = sequence_position + torch.arange( 0, tree_size_q, diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index ab0a9fbd255de..1fc214c12b5d1 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -239,7 +239,7 @@ def get_adapter_absolute_path(lora_path: str) -> str: except (HfHubHTTPError, RepositoryNotFoundError, EntryNotFoundError, HFValidationError): # Handle errors that may occur during the download - # Return original path instead instead of throwing error here + # Return original path instead of throwing error here logger.exception("Error downloading the HuggingFace model") return lora_path diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index 099d8613fc1a7..b2dd2501095f8 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -94,7 +94,7 @@ def find_matched_target( config that a layer corresponds to. Recall that a compressed-tensors configs has a concept of - config_groups, where each layer can be quantized with with a different + config_groups, where each layer can be quantized with a different scheme. targets in each config_group will be a list of either layer names diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index ac967dcc4003e..794e24c2c748c 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -213,7 +213,7 @@ class MediaConnector: image_mode: str = "RGB", ) -> Image.Image: """ - Load a PIL image from a HTTP or base64 data URL. 
+ Load a PIL image from an HTTP or base64 data URL. By default, the image is converted into RGB format. """ @@ -237,7 +237,7 @@ class MediaConnector: image_mode: str = "RGB", ) -> Image.Image: """ - Asynchronously load a PIL image from a HTTP or base64 data URL. + Asynchronously load a PIL image from an HTTP or base64 data URL. By default, the image is converted into RGB format. """ @@ -261,7 +261,7 @@ class MediaConnector: image_mode: str = "RGB", ) -> tuple[npt.NDArray, dict[str, Any]]: """ - Load video from a HTTP or base64 data URL. + Load video from an HTTP or base64 data URL. """ image_io = ImageMediaIO(image_mode=image_mode, **self.media_io_kwargs.get("image", {})) @@ -281,7 +281,7 @@ class MediaConnector: image_mode: str = "RGB", ) -> tuple[npt.NDArray, dict[str, Any]]: """ - Asynchronously load video from a HTTP or base64 data URL. + Asynchronously load video from an HTTP or base64 data URL. By default, the image is converted into RGB format. """ @@ -370,7 +370,7 @@ def group_mm_inputs_by_modality( def modality_group_func( mm_input: MultiModalKwargsItems) -> Union[str, int]: - # If the input has multiple modalities, return a id as the unique key + # If the input has multiple modalities, return an id as the unique key # for the mm_input input. 
if len(mm_input) > 1: return id(mm_input) diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 011a90ece01bd..b286a4ba9fe54 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -709,7 +709,7 @@ def reorder_batch_to_split_decodes_and_prefills( for i, req_id in enumerate(input_batch.req_ids): num_tokens = scheduler_output.num_scheduled_tokens[req_id] - # for now treat 1 scheduled token as "decode" even if its not, + # for now treat 1 scheduled token as "decode" even if it's not, # we should update this to something like < 8 in the future but # currently the TritonMLA._forward_decode only supports # num_tokens = 1 diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py index 95319831d5121..953185a8fc31d 100644 --- a/vllm/v1/structured_output/utils.py +++ b/vllm/v1/structured_output/utils.py @@ -65,9 +65,9 @@ def get_outlines_cache_path() -> str: elif xdg_cache_home: return os.path.join(xdg_cache_home, ".cache", "outlines") # If homedir is "/", we may be inside a container, and thus writing to - # root would be problematic, so we fallback to using a tempfile. + # root would be problematic, so we fall back to using a tempfile. # Also validate the path exists, since os.path.expanduser does - # not garuntee existence. + # not guarantee existence. 
elif os.path.isdir(home_dir) and home_dir != "/": # Default Unix fallback: ~/.cache/outlines return os.path.join(home_dir, ".cache", "outlines") diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index 9adf8a14213f3..3f4e3ecbd4e26 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -250,7 +250,7 @@ class TPUWorker: scheduler_output: "SchedulerOutput", ) -> Optional[ModelRunnerOutput]: output = self.model_runner.execute_model(scheduler_output) - # every worker's output is needed when kv_transfer_group is setup + # every worker's output is needed when kv_transfer_group is set up return output if self.is_driver_worker or has_kv_transfer_group( ) else None From 02d411fdb232f31ce46c6f8076bfe9e5acf88fc9 Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Wed, 3 Sep 2025 06:14:07 +0200 Subject: [PATCH 47/95] [Doc]: fix typos in Python comments (#24115) Signed-off-by: Didier Durand --- .buildkite/nightly-benchmarks/scripts/compare-json-results.py | 2 +- benchmarks/benchmark_serving.py | 2 +- benchmarks/benchmark_serving_structured_output.py | 2 +- benchmarks/benchmark_throughput.py | 2 +- tools/profiler/visualize_layerwise_profile.py | 2 +- vllm/compilation/collective_fusion.py | 2 +- vllm/engine/multiprocessing/engine.py | 2 +- vllm/model_executor/layers/quantization/utils/w8a8_utils.py | 2 +- vllm/model_executor/model_loader/default_loader.py | 2 +- vllm/v1/worker/xpu_worker.py | 2 +- vllm/worker/worker.py | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index 50431d0cd4c5e..5ea5a50a258a4 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -218,7 +218,7 @@ if __name__ == "__main__": "--xaxis", type=str, default="# of max concurrency.", 
- help="column name to use as X Axis in comparision graph", + help="column name to use as X Axis in comparison graph", ) args = parser.parse_args() diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 02f5f585c0c16..934df05efac17 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -1104,7 +1104,7 @@ def create_argument_parser(): "--percentile-metrics", type=str, default="ttft,tpot,itl", - help="Comma-separated list of selected metrics to report percentils. " + help="Comma-separated list of selected metrics to report percentiles. " "This argument specifies the metrics to report percentiles. " 'Allowed metric names are "ttft", "tpot", "itl", "e2el". ' 'Default value is "ttft,tpot,itl".', diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index ca6843a72aa36..4aae755eb4e44 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -998,7 +998,7 @@ def create_argument_parser(): "--percentile-metrics", type=str, default="ttft,tpot,itl", - help="Comma-separated list of selected metrics to report percentils. " + help="Comma-separated list of selected metrics to report percentiles. " "This argument specifies the metrics to report percentiles. " 'Allowed metric names are "ttft", "tpot", "itl", "e2el". ' 'Default value is "ttft,tpot,itl".', diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 6b24b8c8f3c67..34a525f00d910 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -719,7 +719,7 @@ def create_argument_parser(): "[length * (1 - range_ratio), length * (1 + range_ratio)].", ) - # hf dtaset + # hf dataset parser.add_argument( "--hf-subset", type=str, default=None, help="Subset of the HF dataset." 
) diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py index 038d3c44f043a..30d6547073d38 100644 --- a/tools/profiler/visualize_layerwise_profile.py +++ b/tools/profiler/visualize_layerwise_profile.py @@ -119,7 +119,7 @@ def attempt_to_make_names_unique(entries_and_traces): if not all_the_same(trace_eles)), None) if first_trace_difference is None: - # can't create a unique name, leave them names as the + # can't create a unique name, leave the names as they # are they will get aggregated by the pivot_table call continue diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py index 7a99aaff707dc..71274420c3426 100644 --- a/vllm/compilation/collective_fusion.py +++ b/vllm/compilation/collective_fusion.py @@ -513,7 +513,7 @@ if flashinfer_comm is not None: torch.ops._C.static_scaled_fp8_quant( quant_out, norm_out, scale_factor) if scale_factor is None or norm_out is not None: - # we need to return allreduce outpput + # we need to return allreduce output # in cases of non quant fused AR + RMS norm # and fused AR + RMS norm + quant without fused add allreduce_in.copy_(allreduce_out) diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 343b8df7e87bd..138283d4c8a75 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -49,7 +49,7 @@ class MQLLMEngine: This class is used to wrap the [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] class to enable use - in concurrnet manner. It runs a background loop and uses zeromq to + in concurrent manner. It runs a background loop and uses zeromq to receive new requests and stream outputs incrementally via ipc. 
The [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] generate or encode diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 5333bbd310ff9..ecdcc573935c0 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -23,7 +23,7 @@ TORCH_DEVICE_IDENTITY = None # The condition to determine if it is on a platform that supports # torch._scaled_mm rowwise feature. # The condition is determined once as the operations -# are time consuming. +# are time-consuming. USE_ROWWISE_TORCH_SCALED_MM = (current_platform.is_rocm() and version.parse( torch.__version__) >= version.parse("2.7") and current_platform.has_device_capability(94)) diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index 34b8d8e4ed622..1e5aa9e571edb 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -211,7 +211,7 @@ class DefaultModelLoader(BaseModelLoader): if not USE_TPU_COMMONS: # In PyTorch XLA, we should call `xm.mark_step` - # requently so that not too many ops are accumulated + # frequently so that not too many ops are accumulated # in the XLA program. import torch_xla.core.xla_model # as xm import torch_xla.core.xla_model as xm diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py index 17288cda8eccf..7355206f30f57 100644 --- a/vllm/v1/worker/xpu_worker.py +++ b/vllm/v1/worker/xpu_worker.py @@ -84,7 +84,7 @@ class XPUWorker(Worker): """Profiles the peak memory usage of the model to determine how many KV blocks may be allocated without OOMs. The engine will first conduct a profiling of the existing memory usage. 
- Then, it calculate the maximum possible number of GPU and CPU blocks + Then, it calculates the maximum possible number of GPU and CPU blocks that can be allocated with the remaining free memory. .. tip:: You may limit the usage of GPU memory diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 2d2e51c329e74..08bb4e7c9e479 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -234,7 +234,7 @@ class Worker(LocalOrDistributedWorkerBase): KV blocks may be allocated without OOMs. The engine will first conduct a profiling of the existing memory usage. - Then, it calculate the maximum possible number of GPU and CPU blocks + Then, it calculates the maximum possible number of GPU and CPU blocks that can be allocated with the remaining free memory. Tip: From e81d4e69c16c05c147b692a9f028265bd0d49092 Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu Date: Wed, 3 Sep 2025 12:19:14 +0800 Subject: [PATCH 48/95] [Misc] Add check for dual_chunk_attention (#24070) Signed-off-by: zjy0516 --- vllm/config/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 2e0212d010da0..fd3ad2c8a6d6a 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -49,7 +49,8 @@ from vllm.transformers_utils.config import ( try_get_tokenizer_config, uses_mrope) from vllm.transformers_utils.s3_utils import S3Model from vllm.transformers_utils.utils import is_s3, maybe_model_redirect -from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, LayerBlockType, +from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, + STR_DUAL_CHUNK_FLASH_ATTN_VAL, LayerBlockType, LazyLoader, common_broadcastable_dtype, random_uuid) if TYPE_CHECKING: @@ -1304,6 +1305,10 @@ class ModelConfig: self.hf_config.dual_chunk_attention_config[ "sparse_attention_enabled"] = True + if envs.VLLM_ATTENTION_BACKEND != STR_DUAL_CHUNK_FLASH_ATTN_VAL: + raise ValueError("please set VLLM_ATTENTION_BACKEND to " + 
f"{STR_DUAL_CHUNK_FLASH_ATTN_VAL}") + def verify_async_output_proc(self, parallel_config, speculative_config, device_config) -> None: if not self.use_async_output_proc: From 426cc8629f7e630e1c5a0b96fe2db737a170a06d Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Tue, 2 Sep 2025 21:57:59 -0700 Subject: [PATCH 49/95] [BugFix] Fix routed_scaling_factor double mul for dots1 and glm4 MoE models (#24132) Signed-off-by: Yong Hoon Shin --- vllm/model_executor/models/dots1.py | 3 ++- vllm/model_executor/models/glm4_moe.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py index a5477af8694b4..4ddf906dddefe 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -137,7 +137,8 @@ class Dots1MoE(nn.Module): topk_group=config.topk_group, prefix=f"{prefix}.experts", scoring_func=config.scoring_func, - routed_scaling_factor=self.routed_scaling_factor, + # we do scaling outside, set factor to 1.0 to avoid double mul + routed_scaling_factor=1.0, e_score_correction_bias=self.gate.e_score_correction_bias) if config.n_shared_experts is not None: diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 06ed453ec29f9..284506b642d66 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -159,7 +159,8 @@ class Glm4MoE(nn.Module): topk_group=config.topk_group, prefix=f"{prefix}.experts", scoring_func="sigmoid", - routed_scaling_factor=self.routed_scaling_factor, + # we do scaling outside, set factor to 1.0 to avoid double mul + routed_scaling_factor=1.0, e_score_correction_bias=self.gate.e_score_correction_bias, enable_eplb=self.enable_eplb, num_redundant_experts=self.n_redundant_experts) From f38035c123b32f239f746585e197e7250694a1ca Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 3 Sep 2025 14:45:25 +0800 Subject: [PATCH 
50/95] [distributed][rl] remove nccl cumem env var override (#24141) Signed-off-by: youkaichao Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- docs/usage/troubleshooting.md | 2 +- vllm/env_override.py | 18 ------------------ 2 files changed, 1 insertion(+), 19 deletions(-) diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md index b92c6cef4a3fa..4945927e3d787 100644 --- a/docs/usage/troubleshooting.md +++ b/docs/usage/troubleshooting.md @@ -295,4 +295,4 @@ This indicates vLLM failed to initialize the NCCL communicator, possibly due to ## Known Issues - In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](gh-pr:6759). -- To circumvent a NCCL [bug](https://github.com/NVIDIA/nccl/issues/1234) , all vLLM processes will set an environment variable `NCCL_CUMEM_ENABLE=0` to disable NCCL's `cuMem` allocator. It does not affect performance but only gives memory benefits. When external processes want to set up a NCCL connection with vLLM's processes, they should also set this environment variable, otherwise, inconsistent environment setup will cause NCCL to hang or crash, as observed in the [RLHF integration](https://github.com/OpenRLHF/OpenRLHF/pull/604) and the [discussion](gh-issue:5723#issuecomment-2554389656) . +- To address a memory overhead issue in older NCCL versions (see [bug](https://github.com/NVIDIA/nccl/issues/1234)), vLLM versions `>= 0.4.3, <= 0.10.1.1` would set the environment variable `NCCL_CUMEM_ENABLE=0`. External processes connecting to vLLM also needed to set this variable to prevent hangs or crashes. Since the underlying NCCL bug was fixed in NCCL 2.22.3, this override was removed in newer vLLM versions to allow for NCCL performance optimizations. 
diff --git a/vllm/env_override.py b/vllm/env_override.py index ef425d433320d..b06703a2fbf9d 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -13,24 +13,6 @@ logger = init_logger(__name__) # that interact with vllm workers. # they are executed whenever `import vllm` is called. -if os.environ.get('NCCL_CUMEM_ENABLE', '0') != '0': - logger.warning( - "NCCL_CUMEM_ENABLE is set to %s, skipping override. " - "This may increase memory overhead with cudagraph+allreduce: " - "https://github.com/NVIDIA/nccl/issues/1234", - os.environ['NCCL_CUMEM_ENABLE']) -elif not os.path.exists('/dev/nvidia-caps-imex-channels'): - # NCCL requires NCCL_CUMEM_ENABLE to work with - # multi-node NVLink, typically on GB200-NVL72 systems. - # The ultimate way to detect multi-node NVLink is to use - # NVML APIs, which are too expensive to call here. - # As an approximation, we check the existence of - # /dev/nvidia-caps-imex-channels, used by - # multi-node NVLink to communicate across nodes. - # This will still cost some GPU memory, but it is worthwhile - # because we can get very fast cross-node bandwidth with NVLink. 
- os.environ['NCCL_CUMEM_ENABLE'] = '0' - # see https://github.com/vllm-project/vllm/pull/15951 # it avoids unintentional cuda initialization from torch.cuda.is_available() os.environ['PYTORCH_NVML_BASED_CUDA_CHECK'] = '1' From f0c503f66e2f6aafa966318d488fd92ac662cdf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Wed, 3 Sep 2025 09:19:54 +0200 Subject: [PATCH 51/95] [Nixl] Heterogeneous TP support FlashInfer (#20189) Signed-off-by: NickLucche --- .../kv_connector/v1/nixl_connector.py | 62 ++++++++++++++++--- 1 file changed, 53 insertions(+), 9 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index efe023d5595e5..8f16babfe2aeb 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -715,7 +715,7 @@ class NixlConnectorWorker: # are non-contiguous (it's not locally guaranteed that they will be) # Disadvantage is that the encoded NixlAgentMetadata is now larger # (roughly 8KB vs 5KB). - # Conversely for FlashInfer, K and V are transferred in the same tensor + # Conversely for FlashInfer, K and V are registered in the same region # to better exploit the memory layout (ie num_blocks is the first dim). split_k_and_v = not (self.use_mla or self._use_pallas_v1 or self._use_flashinfer) @@ -758,12 +758,21 @@ class NixlConnectorWorker: assert tensor_size_bytes % self.num_blocks == 0 self.block_len = tensor_size_bytes // self.num_blocks self.slot_size_bytes = self.block_len // self.block_size + self.device_kv_caches = kv_caches + self.dst_num_blocks[self.engine_id] = self.num_blocks if self._use_flashinfer: assert self.slot_size_bytes % 2 == 0 self.slot_size_bytes /= 2 - self.device_kv_caches = kv_caches - self.dst_num_blocks[self.engine_id] = self.num_blocks + # NOTE (NickLucche) When FlashInfer is used, memory is registered + # with joint KV for each block. 
This minimizes the overhead in + # registerMem allowing faster descs queries. In order to be able to + # split on kv_heads dim as required by heterogeneous TP, one must + # be able to index K/V separately. Hence the we double the number + # of 'virtual' regions here and halve `block_len` below. + self.num_regions *= 2 + + kv_block_len = self.get_backend_aware_kv_block_len() # Register local/src descr for NIXL xfer. blocks_data = [] for base_addr in seen_base_addresses: @@ -776,8 +785,18 @@ class NixlConnectorWorker: block_offset = block_id * self.block_len addr = base_addr + block_offset # (addr, len, device id) - # TODO: does device_id matter to DRAM? - blocks_data.append((addr, self.block_len, self.tp_rank)) + blocks_data.append((addr, kv_block_len, self.tp_rank)) + + if self._use_flashinfer: + # Separate and interleave K/V regions to maintain the same + # descs ordering. This is needed for selecting contiguous heads + # when split across TP ranks. + for block_id in range(self.num_blocks): + block_offset = block_id * self.block_len + addr = base_addr + block_offset + # Register addresses for V cache (K registered first). + v_addr = addr + kv_block_len + blocks_data.append((v_addr, kv_block_len, self.tp_rank)) logger.debug("Created %s blocks for src engine %s and rank %s", len(blocks_data), self.engine_id, self.tp_rank) @@ -903,7 +922,7 @@ class NixlConnectorWorker: remote_block_size = nixl_agent_meta.block_len // ( self.slot_size_bytes * tp_ratio) if self._use_flashinfer: - # Account for joint KV in FlashInfer. + # With flashinfer, KV are sent in the same message. remote_block_size //= 2 if tp_ratio > 1: # Heterogeneous TP expects same kv_cache_layout. @@ -929,10 +948,10 @@ class NixlConnectorWorker: # rank. With heterogeneous TP, prepare the descriptors by splitting the # P KV cache along kv_head dim, of D worker's kv_head size (D>P). # Eg. PTP1 DTP2 => P0 KV:[block0-KV_0 | block0-KV_1..]. - # Only register the remote's descriptors if current rank pulls from it. 
self.kv_caches_base_addr[ engine_id] = nixl_agent_meta.kv_caches_base_addr - rank_offset = self.tp_rank % tp_ratio * self.block_len \ + kv_block_len = self.get_backend_aware_kv_block_len() + rank_offset = self.tp_rank % tp_ratio * kv_block_len \ if not (self.use_mla or is_kv_replicated) else 0 # Register all remote blocks, but only the corresponding kv heads. for base_addr in nixl_agent_meta.kv_caches_base_addr: @@ -943,7 +962,16 @@ class NixlConnectorWorker: # self.block_len == remote_block_len//tp_ratio bytes. addr = base_addr + block_offset + rank_offset # (addr, len, device id) - blocks_data.append((addr, self.block_len, remote_tp_rank)) + blocks_data.append((addr, kv_block_len, remote_tp_rank)) + + if self._use_flashinfer: + # With FlashInfer index V separately to allow head splitting. + for block_id in range(nixl_agent_meta.num_blocks): + block_offset = block_id * nixl_agent_meta.block_len + addr = base_addr + block_offset + rank_offset + v_addr = addr + nixl_agent_meta.block_len // 2 + blocks_data.append((v_addr, kv_block_len, remote_tp_rank)) + logger.debug( "Created %s blocks for dst engine %s with remote rank %s and " "local rank %s", len(blocks_data), engine_id, remote_tp_rank, @@ -1249,6 +1277,22 @@ class NixlConnectorWorker: descs_ids.append(reg_id * num_blocks + block_id) return descs_ids + def get_backend_aware_kv_block_len(self): + """ + Get the block length for one K/V element (K and V have the same size). + + For FA and other backends, this is equal to the length of the whole + block, as K and V are in separate regions. + For FlashInfer, this is half the length of the whole block, as K and V + share the same region. + """ + if self._use_flashinfer: + # For indexing only half (either just the K or V part). 
+ block_len = self.block_len // 2 + else: + block_len = self.block_len + return block_len + @contextlib.contextmanager def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]: From 70549c1245c3eeb3706e3c09a9e18d702fbf705f Mon Sep 17 00:00:00 2001 From: dsinghvi Date: Wed, 3 Sep 2025 13:43:11 +0530 Subject: [PATCH 52/95] [CI/Build] Serve images used by multimodal tests through local HTTP Server (#23907) Signed-off-by: Divyansh Singhvi Signed-off-by: dsinghvi Co-authored-by: Cyrus Leung --- tests/conftest.py | 122 ++++++++++++++++++ tests/entrypoints/llm/test_chat.py | 5 +- tests/entrypoints/openai/test_vision.py | 43 +++--- .../openai/test_vision_embedding.py | 19 +-- .../multimodal/generation/test_pixtral.py | 57 ++++---- tests/multimodal/test_utils.py | 31 +++-- .../openai/responses/test_image.py | 30 +++-- tests/v1/tpu/test_multimodal.py | 13 +- vllm/assets/image.py | 28 +++- 9 files changed, 250 insertions(+), 98 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 27db5422ceac2..1052aeb35bac7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,9 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import http.server import json import math +import mimetypes import os +import socket import tempfile +import threading +from collections.abc import Generator from enum import Enum from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast @@ -32,6 +37,7 @@ from vllm.distributed import (cleanup_dist_env_and_memory, from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, to_enc_dec_tuple_list, zip_enc_dec_prompts) from vllm.logger import init_logger +from vllm.multimodal.utils import fetch_image from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams from vllm.sequence import Logprob @@ -1253,3 +1259,119 @@ def cli_config_file(): def cli_config_file_with_model(): """Return the path to the CLI config file with 
model.""" return os.path.join(_TEST_DIR, "config", "test_config_with_model.yaml") + + +class AssetHandler(http.server.BaseHTTPRequestHandler): + # _IMAGE_CACHE : Dict[str, bytes] = {} + + def log_message(self, *args, **kwargs): + pass + + def do_GET(self): + # Accepts paths like: /1280px-Venn_diagram_rgb.jpg + filename = self.path.lstrip("/") + if not filename or "." not in filename: + self.send_error(404, "Missing filename (expected /.)") + return + + base, ext = filename.rsplit(".", 1) + ext = ext.lower() + + if ext not in ["jpg", "png"]: + self.send_error(404, f"Unsupported extension: .{ext}") + return + + try: + data = ImageAsset(base).read_bytes(ext=ext) + except Exception as e: + self.send_error(500, f"Failed to load asset: {ext} {base} {e} ") + return + + ctype, _ = mimetypes.guess_type(filename) + if ctype is None: + ctype = {"jpg": "image/jpg", "png": "image/png"}[ext] + self.send_response(200) + self.send_header("Content-Type", ctype) + self.send_header("Content-Length", str(len(data))) + self.end_headers() + self.wfile.write(data) + + +def _find_free_port() -> int: + with socket.socket() as s: + s.bind(("127.0.0.1", 0)) + return s.getsockname()[1] + + +class LocalAssetServer: + + address: str + port: int + server: Optional[http.server.ThreadingHTTPServer] + thread: Optional[threading.Thread] + + def __init__(self, address: str = "127.0.0.1") -> None: + self.address = address + self.port = -1 + self.server = None + self.thread = None + + def __enter__(self): + self.port = _find_free_port() + self.server = http.server.ThreadingHTTPServer( + (self.address, self.port), AssetHandler) + self.thread = threading.Thread(target=self.server.serve_forever, + daemon=True) + self.thread.start() + return self + + def __exit__(self, exc_type, exc_value, traceback): + if self.server: + self.server.shutdown() + del self.server + + if self.thread: + self.thread.join() + del self.thread + + if exc_type is None: + return None + + return False + + @property + def 
base_url(self) -> str:
+        assert self.port is not None
+        return f"http://{self.address}:{self.port}"
+
+    def url_for(self, name: str) -> str:
+        """e.g., name='RGBA_comp.png' -> 'http://127.0.0.1:PORT/RGBA_comp.png'"""
+        return f"{self.base_url}/{name}"
+
+    def get_image_asset(self, name: str) -> Image.Image:
+        return fetch_image(self.url_for(name))
+
+
+@pytest.fixture(scope="session")
+def local_asset_server() -> Generator[LocalAssetServer, None, None]:
+    """
+    Starts a thread based HTTP server bound to 127.0.0.1 on a random free port.
+    The server currently serves images at:
+    http://127.0.0.1:<port>/<filename>.<ext>
+    """
+    with LocalAssetServer() as srv:
+        yield srv
+
+
+@pytest.fixture
+def image_url(request, local_asset_server) -> str:
+    # request.param is one of the IMAGE_ASSETS filenames
+    name = request.param
+    return local_asset_server.url_for(name)
+
+
+@pytest.fixture
+def image_urls(request, local_asset_server) -> list[str]:
+    """Indirect fixture: takes a list of names, returns list of full URLs."""
+    names: list[str] = request.param
+    return [local_asset_server.url_for(name) for name in names]
diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py
index 2cbfed98a577a..bf460d0fb25d3 100644
--- a/tests/entrypoints/llm/test_chat.py
+++ b/tests/entrypoints/llm/test_chat.py
@@ -7,7 +7,7 @@ import pytest
 from vllm import LLM
 from vllm.distributed import cleanup_dist_env_and_memory
 
-from ..openai.test_vision import TEST_IMAGE_URLS
+from ..openai.test_vision import TEST_IMAGE_ASSETS
 
 
 @pytest.fixture(scope="function")
@@ -95,7 +95,8 @@ def vision_llm():
 
 
 @pytest.mark.parametrize("image_urls",
-                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
+                         [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]],
+                         indirect=True)
 def test_chat_multi_image(vision_llm, image_urls: list[str]):
     messages = [{
         "role":
diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py
index 106ec121a422e..9d61754059e2f 100644
--- 
a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -16,11 +16,11 @@ MODEL_NAME = "microsoft/Phi-3.5-vision-instruct" MAXIMUM_IMAGES = 2 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) -TEST_IMAGE_URLS = [ - "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", - "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", - "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", - "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", +TEST_IMAGE_ASSETS = [ + "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", + "1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", + "RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", ] EXPECTED_MM_BEAM_SEARCH_RES = [ @@ -69,10 +69,11 @@ async def client(server): @pytest.fixture(scope="session") -def base64_encoded_image() -> dict[str, str]: +def base64_encoded_image(local_asset_server) -> dict[str, str]: return { - image_url: encode_image_base64(fetch_image(image_url)) - for image_url in TEST_IMAGE_URLS + image_asset: + encode_image_base64(local_asset_server.get_image_asset(image_asset)) + for image_asset in TEST_IMAGE_ASSETS } @@ -97,7 +98,7 @@ def get_hf_prompt_tokens(model_name, content, image_url): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("image_url", 
TEST_IMAGE_URLS) +@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_single_chat_session_image(client: openai.AsyncOpenAI, model_name: str, image_url: str): content_text = "What's in this image?" @@ -157,7 +158,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI, model_name: str, image_url: str): @@ -187,7 +188,7 @@ async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI, model_name: str, image_url: str): @@ -223,10 +224,11 @@ async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS) +@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_single_chat_session_image_base64encoded( - client: openai.AsyncOpenAI, model_name: str, image_url: str, - base64_encoded_image: dict[str, str]): + client: openai.AsyncOpenAI, model_name: str, raw_image_url: str, + image_url: str, base64_encoded_image: dict[str, str]): content_text = "What's in this image?" 
messages = [{ @@ -237,7 +239,7 @@ async def test_single_chat_session_image_base64encoded( "type": "image_url", "image_url": { "url": - f"data:image/jpeg;base64,{base64_encoded_image[image_url]}" + f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" } }, { @@ -287,12 +289,12 @@ async def test_single_chat_session_image_base64encoded( @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_URLS)))) +@pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_ASSETS)))) async def test_single_chat_session_image_base64encoded_beamsearch( client: openai.AsyncOpenAI, model_name: str, image_idx: int, base64_encoded_image: dict[str, str]): # NOTE: This test also validates that we pass MM data through beam search - image_url = TEST_IMAGE_URLS[image_idx] + raw_image_url = TEST_IMAGE_ASSETS[image_idx] expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx] messages = [{ @@ -303,7 +305,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch( "type": "image_url", "image_url": { "url": - f"data:image/jpeg;base64,{base64_encoded_image[image_url]}" + f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}" } }, { @@ -326,7 +328,7 @@ async def test_single_chat_session_image_base64encoded_beamsearch( @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_chat_streaming_image(client: openai.AsyncOpenAI, model_name: str, image_url: str): messages = [{ @@ -385,7 +387,8 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI, @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize( "image_urls", - [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))]) + [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))], + indirect=True) async def 
test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, image_urls: list[str]): diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index d3cc2fac6af57..dbd403fb7a7b5 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -19,11 +19,11 @@ vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec.jinja" assert vlm2vec_jinja_path.exists() # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) -TEST_IMAGE_URLS = [ - "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", - "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", - "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", - "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", +TEST_IMAGE_ASSETS = [ + "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", + "1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", + "RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", ] @@ -49,10 +49,11 @@ def server(): @pytest.fixture(scope="session") -def base64_encoded_image() -> dict[str, str]: +def base64_encoded_image(local_asset_server) -> dict[str, str]: return { - image_url: encode_image_base64(fetch_image(image_url)) - for image_url in TEST_IMAGE_URLS + image_url: + 
 encode_image_base64(local_asset_server.get_image_asset(image_url))
+        for image_url in TEST_IMAGE_ASSETS
     }
 
 
@@ -70,7 +71,7 @@ def get_hf_prompt_tokens(model_name, content, image_url):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
 async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
                                image_url: str):
     content_text = "Represent the given image."
diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py
index d39cf706786e2..f95dbc7547ecc 100644
--- a/tests/models/multimodal/generation/test_pixtral.py
+++ b/tests/models/multimodal/generation/test_pixtral.py
@@ -29,10 +29,10 @@ MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
 MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID]
 
 IMG_URLS = [
-    "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
-    "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/231-200x300.jpg",
-    "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/27-500x500.jpg",
-    "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/17-150x600.jpg",
+    "237-400x300.jpg",  # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
+    "231-200x300.jpg",  # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/231-200x300.jpg",
+    "27-500x500.jpg",  # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/27-500x500.jpg",
+    "17-150x600.jpg",  # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/17-150x600.jpg",
 ]
 
 PROMPT = "Describe each image in one short sentence."
@@ -105,12 +105,6 @@ def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt: return engine_inputs -MSGS = [ - _create_msg_format(IMG_URLS[:1]), - _create_msg_format(IMG_URLS[:2]), - _create_msg_format(IMG_URLS), -] - SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5) LIMIT_MM_PER_PROMPT = dict(image=4) @@ -156,12 +150,8 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs: @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN) @pytest.mark.parametrize("dtype", ["bfloat16"]) -def test_chat( - vllm_runner, - max_model_len: int, - model: str, - dtype: str, -) -> None: +def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str, + local_asset_server) -> None: EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs( FIXTURE_LOGPROBS_CHAT[model]) with vllm_runner( @@ -174,7 +164,14 @@ def test_chat( limit_mm_per_prompt=LIMIT_MM_PER_PROMPT, ) as vllm_model: outputs = [] - for msg in MSGS: + + urls_all = [local_asset_server.url_for(u) for u in IMG_URLS] + msgs = [ + _create_msg_format(urls_all[:1]), + _create_msg_format(urls_all[:2]), + _create_msg_format(urls_all), + ] + for msg in msgs: output = vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS) outputs.extend(output) @@ -190,14 +187,24 @@ def test_chat( name_1="output") -@pytest.mark.parametrize("prompt,expected_ranges", - [(_create_engine_inputs_hf(IMG_URLS[:1]), - [PlaceholderRange(offset=11, length=494)]), - (_create_engine_inputs_hf(IMG_URLS[1:4]), [ - PlaceholderRange(offset=11, length=266), - PlaceholderRange(offset=277, length=1056), - PlaceholderRange(offset=1333, length=418) - ])]) +@pytest.fixture +def prompt(request, local_asset_server) -> TextPrompt: + names = request.param + urls = [local_asset_server.url_for(n) for n in names] + return _create_engine_inputs_hf(urls) + + +@pytest.mark.parametrize( + "prompt,expected_ranges", + [ + pytest.param(IMG_URLS[:1], [PlaceholderRange(offset=11, length=494)]), + 
pytest.param(IMG_URLS[1:4], [ + PlaceholderRange(offset=11, length=266), + PlaceholderRange(offset=277, length=1056), + PlaceholderRange(offset=1333, length=418) + ]) + ], +) def test_multi_modal_placeholders(vllm_runner, prompt: TextPrompt, expected_ranges: list[PlaceholderRange], monkeypatch) -> None: diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 0f82e1f3e343e..886582a516409 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -31,11 +31,11 @@ if TYPE_CHECKING: from vllm.multimodal.inputs import MultiModalPlaceholderDict # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) -TEST_IMAGE_URLS = [ - "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", - "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", - "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", - "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", +TEST_IMAGE_ASSETS = [ + "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", + "1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", + "RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", ] TEST_VIDEO_URLS = [ @@ -45,12 +45,11 @@ TEST_VIDEO_URLS = [ @pytest.fixture(scope="module") -def url_images() -> dict[str, Image.Image]: - connector = MediaConnector() +def url_images(local_asset_server) -> dict[str, 
Image.Image]: return { - image_url: connector.fetch_image(image_url) - for image_url in TEST_IMAGE_URLS + image_url: local_asset_server.get_image_asset(image_url) + for image_url in TEST_IMAGE_ASSETS } @@ -69,7 +68,7 @@ def _image_equals(a: Image.Image, b: Image.Image) -> bool: @pytest.mark.asyncio -@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_fetch_image_http(image_url: str): connector = MediaConnector() @@ -79,12 +78,12 @@ async def test_fetch_image_http(image_url: str): @pytest.mark.asyncio -@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS) @pytest.mark.parametrize("suffix", get_supported_suffixes()) async def test_fetch_image_base64(url_images: dict[str, Image.Image], - image_url: str, suffix: str): + raw_image_url: str, suffix: str): connector = MediaConnector() - url_image = url_images[image_url] + url_image = url_images[raw_image_url] try: mime_type = Image.MIME[Image.registered_extensions()[suffix]] @@ -117,7 +116,7 @@ async def test_fetch_image_base64(url_images: dict[str, Image.Image], @pytest.mark.asyncio -@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_fetch_image_local_files(image_url: str): connector = MediaConnector() @@ -152,8 +151,8 @@ async def test_fetch_image_local_files(image_url: str): @pytest.mark.asyncio -async def test_fetch_image_local_files_with_space_in_name(): - image_url = TEST_IMAGE_URLS[0] +@pytest.mark.parametrize("image_url", [TEST_IMAGE_ASSETS[0]], indirect=True) +async def test_fetch_image_local_files_with_space_in_name(image_url: str): connector = MediaConnector() with TemporaryDirectory() as temp_dir: diff --git a/tests/v1/entrypoints/openai/responses/test_image.py b/tests/v1/entrypoints/openai/responses/test_image.py index c8d09fd39fb13..3ed36ca678c0c 100644 --- 
a/tests/v1/entrypoints/openai/responses/test_image.py +++ b/tests/v1/entrypoints/openai/responses/test_image.py @@ -8,17 +8,17 @@ import pytest import pytest_asyncio from tests.utils import RemoteOpenAIServer -from vllm.multimodal.utils import encode_image_base64, fetch_image +from vllm.multimodal.utils import encode_image_base64 # Use a small vision model for testing MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct" MAXIMUM_IMAGES = 2 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) -TEST_IMAGE_URLS = [ - "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", - "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", - "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", - "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", +TEST_IMAGE_ASSETS = [ + "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "Grayscale_8bits_palette_sample_image.png", # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", + "1280px-Venn_diagram_rgb.svg.png", # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", + "RGBA_comp.png", # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", ] @@ -52,16 +52,17 @@ async def client(image_server): @pytest.fixture(scope="session") -def base64_encoded_image() -> dict[str, str]: +def base64_encoded_image(local_asset_server) -> dict[str, str]: return { - image_url: encode_image_base64(fetch_image(image_url)) - for image_url in TEST_IMAGE_URLS + image_url: + encode_image_base64(local_asset_server.get_image_asset(image_url)) + for 
image_url in TEST_IMAGE_ASSETS } @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_single_chat_session_image(client: openai.AsyncOpenAI, model_name: str, image_url: str): content_text = "What's in this image?" @@ -91,11 +92,11 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS) async def test_single_chat_session_image_base64encoded( client: openai.AsyncOpenAI, model_name: str, - image_url: str, + raw_image_url: str, base64_encoded_image: dict[str, str], ): content_text = "What's in this image?" @@ -106,7 +107,7 @@ async def test_single_chat_session_image_base64encoded( { "type": "input_image", "image_url": - f"data:image/jpeg;base64,{base64_encoded_image[image_url]}", + f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}", "detail": "auto", }, { @@ -127,7 +128,8 @@ async def test_single_chat_session_image_base64encoded( @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize( "image_urls", - [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))]) + [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))], + indirect=True) async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, image_urls: list[str]): messages = [{ diff --git a/tests/v1/tpu/test_multimodal.py b/tests/v1/tpu/test_multimodal.py index bcc2993028dd6..9947fcbe73135 100644 --- a/tests/v1/tpu/test_multimodal.py +++ b/tests/v1/tpu/test_multimodal.py @@ -4,18 +4,19 @@ import openai import pytest -from vllm.multimodal.utils import encode_image_base64, fetch_image +from vllm.multimodal.utils import encode_image_base64 from vllm.platforms import current_platform -from 
...entrypoints.openai.test_vision import TEST_IMAGE_URLS +from ...entrypoints.openai.test_vision import TEST_IMAGE_ASSETS from ...utils import RemoteOpenAIServer @pytest.fixture(scope="session") -def base64_encoded_image() -> dict[str, str]: +def base64_encoded_image(local_asset_server) -> dict[str, str]: return { - image_url: encode_image_base64(fetch_image(image_url)) - for image_url in TEST_IMAGE_URLS + image_asset: + encode_image_base64(local_asset_server.get_image_asset(image_asset)) + for image_asset in TEST_IMAGE_ASSETS } @@ -66,7 +67,7 @@ async def test_basic_vision(model_name: str, base64_encoded_image: dict[str, client: openai.AsyncOpenAI = remote_server.get_async_client() # Other requests now should be much faster - for image_url in TEST_IMAGE_URLS: + for image_url in TEST_IMAGE_ASSETS: image_base64 = base64_encoded_image[image_url] chat_completion_from_base64 = await client.chat.completions\ .create( diff --git a/vllm/assets/image.py b/vllm/assets/image.py index c8f8d43a98355..4639a11187d03 100644 --- a/vllm/assets/image.py +++ b/vllm/assets/image.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass +from pathlib import Path from typing import Literal import torch @@ -11,17 +12,29 @@ from .base import get_vllm_public_assets VLM_IMAGES_DIR = "vision_model_images" -ImageAssetName = Literal["stop_sign", "cherry_blossom", "hato"] +ImageAssetName = Literal["stop_sign", "cherry_blossom", "hato", + "2560px-Gfp-wisconsin-madison-the-nature-boardwalk", + "Grayscale_8bits_palette_sample_image", + "1280px-Venn_diagram_rgb", "RGBA_comp", "237-400x300", + "231-200x300", "27-500x500", "17-150x600", + "handelsblatt-preview", "paper-11"] @dataclass(frozen=True) class ImageAsset: name: ImageAssetName + def get_path(self, ext: str) -> Path: + """ + Return s3 path for given image. 
+ """ + return get_vllm_public_assets(filename=f"{self.name}.{ext}", + s3_prefix=VLM_IMAGES_DIR) + @property - def pil_image(self) -> Image.Image: - image_path = get_vllm_public_assets(filename=f"{self.name}.jpg", - s3_prefix=VLM_IMAGES_DIR) + def pil_image(self, ext="jpg") -> Image.Image: + + image_path = self.get_path(ext) return Image.open(image_path) @property @@ -29,6 +42,9 @@ class ImageAsset: """ Image embeddings, only used for testing purposes with llava 1.5. """ - image_path = get_vllm_public_assets(filename=f"{self.name}.pt", - s3_prefix=VLM_IMAGES_DIR) + image_path = self.get_path('pt') return torch.load(image_path, map_location="cpu", weights_only=True) + + def read_bytes(self, ext: str) -> bytes: + p = Path(self.get_path(ext)) + return p.read_bytes() From 9c99e4871f12aa57072b3dfd14157018d30e327b Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 3 Sep 2025 16:34:29 +0800 Subject: [PATCH 53/95] [Misc] Clean up deadcode for legacy processing pipeline (#24153) Signed-off-by: Isotr0py --- tests/models/multimodal/processing/test_tensor_schema.py | 3 --- vllm/multimodal/utils.py | 5 +---- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index 1a11fa3d2b824..615564f70ea31 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -41,9 +41,6 @@ ARCH_NEEDS_EXTRAS = [ ] REPO_ID_TO_SKIP = { "nm-testing/pixtral-12b-FP8-dynamic": "duplicated test", - # FIXME(Isotr0py): enable GPT-OSS based InternVL3.5 model - # after support PP for GPT-OSS - "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview": "Broken model", } ImageInput = list[Image.Image] diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 794e24c2c748c..e09c97de576ef 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -378,10 +378,7 @@ def group_mm_inputs_by_modality( elif 
len(mm_input) == 1: return next(iter(mm_input.keys())) - # FIXME(Isotr0py): Modality of mm_input from legacy pipeline is empty, - # this is used to make InternVL with legacy pipeline still work with v1. - else: - return "" + raise AssertionError("This line should be unreachable.") return [ list(group) for _, group in groupby(mm_inputs, key=modality_group_func) From 51383bd472747cfbc1f8f2e40767e1becef01f0f Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Wed, 3 Sep 2025 17:23:56 +0800 Subject: [PATCH 54/95] [CI] Accelerate mteb test by setting SentenceTransformers mteb score to a constant (#24088) Signed-off-by: wang.yuqi --- .../openai/correctness/test_mteb_embed.py | 4 ++- .../openai/correctness/test_mteb_score.py | 31 ++++++++-------- tests/models/language/pooling/embed_utils.py | 5 +-- tests/models/language/pooling/mteb_utils.py | 36 ++++++++++++------- tests/models/language/pooling/test_baai.py | 4 +++ .../pooling/test_bge_reranker_v2_gemma.py | 3 +- .../language/pooling/test_cross_encoder.py | 2 ++ .../models/language/pooling/test_embedding.py | 5 +-- tests/models/language/pooling/test_gte.py | 26 ++++++++------ .../models/language/pooling/test_intfloat.py | 4 ++- tests/models/language/pooling/test_jina.py | 2 ++ .../language/pooling/test_mxbai_rerank.py | 1 + tests/models/language/pooling/test_nomic.py | 2 ++ .../language/pooling/test_qwen3_reranker.py | 1 + .../pooling/test_snowflake_arctic_embed.py | 7 +++- .../language/pooling/test_st_projector.py | 1 + tests/models/utils.py | 1 + 17 files changed, 83 insertions(+), 52 deletions(-) diff --git a/tests/entrypoints/openai/correctness/test_mteb_embed.py b/tests/entrypoints/openai/correctness/test_mteb_embed.py index 783f7d3e0d5aa..1601c18d9b787 100644 --- a/tests/entrypoints/openai/correctness/test_mteb_embed.py +++ b/tests/entrypoints/openai/correctness/test_mteb_embed.py @@ -37,4 +37,6 @@ def test_mteb_embed(server): print("SentenceTransformer main score: ", st_main_score) print("Difference: ", st_main_score 
- vllm_main_score) - assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL) + # We are not concerned that the vllm mteb results are better + # than SentenceTransformers, so we only perform one-sided testing. + assert st_main_score - vllm_main_score < MTEB_EMBED_TOL diff --git a/tests/entrypoints/openai/correctness/test_mteb_score.py b/tests/entrypoints/openai/correctness/test_mteb_score.py index cfb865815c9b2..417f85adc6e06 100644 --- a/tests/entrypoints/openai/correctness/test_mteb_score.py +++ b/tests/entrypoints/openai/correctness/test_mteb_score.py @@ -6,16 +6,19 @@ import pytest # yapf conflicts with isort for this block # yapf: disable -from tests.models.language.pooling.mteb_utils import ( - MTEB_RERANK_LANGS, MTEB_RERANK_TASKS, MTEB_RERANK_TOL, - RerankClientMtebEncoder, ScoreClientMtebEncoder, - mteb_test_rerank_models_hf, run_mteb_rerank) +from tests.models.language.pooling.mteb_utils import (MTEB_RERANK_LANGS, + MTEB_RERANK_TASKS, + MTEB_RERANK_TOL, + RerankClientMtebEncoder, + ScoreClientMtebEncoder, + run_mteb_rerank) # yapf: enable from tests.utils import RemoteOpenAIServer os.environ["VLLM_LOGGING_LEVEL"] = "WARNING" MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2" +st_main_score = 0.33457 @pytest.fixture(scope="module") @@ -29,15 +32,7 @@ def server(): yield remote_server -@pytest.fixture(scope="module") -def st_main_score(hf_runner): - # The main score related to the version of the dependency. - # So we need to recalculate every time. 
- main_score, st_dtype = mteb_test_rerank_models_hf(hf_runner, MODEL_NAME) - return main_score - - -def test_mteb_score(server, st_main_score): +def test_mteb_score(server): url = server.url_for("score") encoder = ScoreClientMtebEncoder(MODEL_NAME, url) vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS, @@ -47,10 +42,12 @@ def test_mteb_score(server, st_main_score): print("SentenceTransformer main score: ", st_main_score) print("Difference: ", st_main_score - vllm_main_score) - assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL) + # We are not concerned that the vllm mteb results are better + # than SentenceTransformers, so we only perform one-sided testing. + assert st_main_score - vllm_main_score < MTEB_RERANK_TOL -def test_mteb_rerank(server, st_main_score): +def test_mteb_rerank(server): url = server.url_for("rerank") encoder = RerankClientMtebEncoder(MODEL_NAME, url) vllm_main_score = run_mteb_rerank(encoder, MTEB_RERANK_TASKS, @@ -60,4 +57,6 @@ def test_mteb_rerank(server, st_main_score): print("SentenceTransformer main score: ", st_main_score) print("Difference: ", st_main_score - vllm_main_score) - assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL) + # We are not concerned that the vllm mteb results are better + # than SentenceTransformers, so we only perform one-sided testing. + assert st_main_score - vllm_main_score < MTEB_RERANK_TOL diff --git a/tests/models/language/pooling/embed_utils.py b/tests/models/language/pooling/embed_utils.py index a74ad2aa25972..8f8393c4e16fc 100644 --- a/tests/models/language/pooling/embed_utils.py +++ b/tests/models/language/pooling/embed_utils.py @@ -35,10 +35,7 @@ def correctness_test_embed_models(hf_runner, example_prompts, vllm_extra_kwargs=None, hf_model_callback=None): - if not model_info.enable_test: - # A model family has many models with the same architecture, - # and we don't need to test each one. 
- pytest.skip("Skipping test.") + pytest.skip("Debug only, ci prefers to use mteb test.") # The example_prompts has ending "\n", for example: # "Write a short story about a robot that dreams for the first time.\n" diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index 640858125bfca..7be1bba2ff69f 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -18,7 +18,7 @@ from tests.models.utils import EmbedModelInfo, RerankModelInfo # - Different model results in differences more than 1e-3 # 1e-4 is a good tolerance threshold MTEB_EMBED_TASKS = ["STS12"] -MTEB_EMBED_TOL = 0.02 +MTEB_EMBED_TOL = 1e-4 # See #19344 MTEB_RERANK_TASKS = ["NFCorpus"] @@ -192,22 +192,28 @@ def mteb_test_embed_models(hf_runner, MTEB_EMBED_TASKS) vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype - with hf_runner(model_info.name, - is_sentence_transformer=True, - dtype="float32") as hf_model: + if model_info.mteb_score is None: + with hf_runner(model_info.name, + is_sentence_transformer=True, + dtype="float32") as hf_model: - if hf_model_callback is not None: - hf_model_callback(hf_model) + if hf_model_callback is not None: + hf_model_callback(hf_model) - st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS) - st_dtype = next(hf_model.model.parameters()).dtype + st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS) + st_dtype = next(hf_model.model.parameters()).dtype + else: + st_main_score = model_info.mteb_score + st_dtype = "Constant" print("Model:", model_info.name) print("VLLM:", vllm_dtype, vllm_main_score) print("SentenceTransformers:", st_dtype, st_main_score) print("Difference:", st_main_score - vllm_main_score) - assert st_main_score == pytest.approx(vllm_main_score, abs=atol) + # We are not concerned that the vllm mteb results are better + # than SentenceTransformers, so we only perform one-sided testing. 
+ assert st_main_score - vllm_main_score < atol def run_mteb_rerank(cross_encoder, tasks, languages): @@ -310,12 +316,18 @@ def mteb_test_rerank_models(hf_runner, languages=MTEB_RERANK_LANGS) vllm_dtype = model_config.dtype - st_main_score, st_dtype = mteb_test_rerank_models_hf( - hf_runner, model_info.name, hf_model_callback) + if model_info.mteb_score is None: + st_main_score, st_dtype = mteb_test_rerank_models_hf( + hf_runner, model_info.name, hf_model_callback) + else: + st_main_score = model_info.mteb_score + st_dtype = "Constant" print("Model:", model_info.name) print("VLLM:", vllm_dtype, vllm_main_score) print("SentenceTransformers:", st_dtype, st_main_score) print("Difference:", st_main_score - vllm_main_score) - assert st_main_score == pytest.approx(vllm_main_score, abs=atol) + # We are not concerned that the vllm mteb results are better + # than SentenceTransformers, so we only perform one-sided testing. + assert st_main_score - vllm_main_score < atol diff --git a/tests/models/language/pooling/test_baai.py b/tests/models/language/pooling/test_baai.py index 6fbe0e82d7f8a..be8cb6fa76994 100644 --- a/tests/models/language/pooling/test_baai.py +++ b/tests/models/language/pooling/test_baai.py @@ -12,6 +12,7 @@ MODELS = [ ########## BertModel CLSPoolingEmbedModelInfo("BAAI/bge-base-en", architecture="BertModel", + mteb_score=0.779336792, enable_test=True), CLSPoolingEmbedModelInfo("BAAI/bge-base-zh", architecture="BertModel", @@ -52,10 +53,12 @@ MODELS = [ ########## XLMRobertaModel CLSPoolingEmbedModelInfo("BAAI/bge-m3", architecture="XLMRobertaModel", + mteb_score=0.787343078, enable_test=True), ########## Qwen2Model LASTPoolingEmbedModelInfo("BAAI/bge-code-v1", architecture="Qwen2Model", + mteb_score=0.75724465, dtype="float32", enable_test=True), ] @@ -65,6 +68,7 @@ RERANK_MODELS = [ CLSPoolingRerankModelInfo( "BAAI/bge-reranker-base", architecture="XLMRobertaForSequenceClassification", + mteb_score=0.32398, enable_test=True), CLSPoolingRerankModelInfo( 
"BAAI/bge-reranker-large", diff --git a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling/test_bge_reranker_v2_gemma.py index f473e0ba01ffa..eaa8bfb84ffdd 100644 --- a/tests/models/language/pooling/test_bge_reranker_v2_gemma.py +++ b/tests/models/language/pooling/test_bge_reranker_v2_gemma.py @@ -104,7 +104,6 @@ class GemmaMtebEncoder(VllmMtebEncoder): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.prompt = PROMPT self.query_template = "A: {query}\n" self.document_template = "B: {doc}\n{prompt}" @@ -119,7 +118,7 @@ class GemmaMtebEncoder(VllmMtebEncoder): _sentences = [] for query, corpus, prompt in sentences: query = self.query_template.format(query=query) - corpus = self.document_template.format(doc=corpus, prompt=prompt) + corpus = self.document_template.format(doc=corpus, prompt=PROMPT) _sentences.append((query, corpus, prompt)) return super().predict(_sentences, *args, **kwargs) diff --git a/tests/models/language/pooling/test_cross_encoder.py b/tests/models/language/pooling/test_cross_encoder.py index 8c1bc5779b8a1..b49908c9ce6a6 100644 --- a/tests/models/language/pooling/test_cross_encoder.py +++ b/tests/models/language/pooling/test_cross_encoder.py @@ -8,8 +8,10 @@ from .mteb_utils import mteb_test_rerank_models RERANK_MODELS = [ CLSPoolingRerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2", + mteb_score=0.32898, architecture="BertForSequenceClassification"), LASTPoolingRerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls", + mteb_score=0.25736, architecture="Qwen3ForSequenceClassification") ] diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index f918b2b91bcc3..0733ac85c11fc 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -7,7 +7,7 @@ import pytest from vllm.config import PoolerConfig from vllm.platforms import current_platform -from ...utils import 
check_embeddings_close, check_transformers_version +from ...utils import check_embeddings_close @pytest.mark.parametrize( @@ -30,7 +30,6 @@ from ...utils import check_embeddings_close, check_transformers_version pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]), pytest.param("sentence-transformers/all-MiniLM-L12-v2"), pytest.param("intfloat/multilingual-e5-small"), - pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), # [Cross-Encoder] pytest.param("sentence-transformers/stsb-roberta-base-v2"), ], @@ -42,8 +41,6 @@ def test_models( model, monkeypatch, ) -> None: - if model == "Alibaba-NLP/gte-Qwen2-1.5B-instruct": - check_transformers_version(model, max_transformers_version="4.53.2") if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm(): # ROCm Triton FA does not currently support sliding window attention diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index 9911620c018ef..98d215b0ad25e 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -5,13 +5,14 @@ import pytest from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo, EmbedModelInfo, LASTPoolingEmbedModelInfo, - RerankModelInfo, check_transformers_version) + RerankModelInfo) from .embed_utils import correctness_test_embed_models from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models MODELS = [ ########## BertModel CLSPoolingEmbedModelInfo("thenlper/gte-large", + mteb_score=0.76807651, architecture="BertModel", enable_test=True), CLSPoolingEmbedModelInfo("thenlper/gte-base", @@ -30,28 +31,37 @@ MODELS = [ architecture="BertModel", enable_test=False), ########### NewModel + # These three architectures are almost the same, but not exactly the same. 
+ # For example, + # - whether to use token_type_embeddings + # - whether to use context expansion + # So only test one (the most widely used) model CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-multilingual-base", architecture="GteNewModel", + mteb_score=0.775074696, hf_overrides={"architectures": ["GteNewModel"]}, enable_test=True), CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5", architecture="GteNewModel", hf_overrides={"architectures": ["GteNewModel"]}, - enable_test=True), + enable_test=False), CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5", architecture="GteNewModel", hf_overrides={"architectures": ["GteNewModel"]}, - enable_test=True), + enable_test=False), ########### Qwen2ForCausalLM LASTPoolingEmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct", + mteb_score=0.758473459018872, architecture="Qwen2ForCausalLM", enable_test=True), ########## ModernBertModel CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-modernbert-base", + mteb_score=0.748193353, architecture="ModernBertModel", enable_test=True), ########## Qwen3ForCausalLM LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-0.6B", + mteb_score=0.771163695, architecture="Qwen3ForCausalLM", dtype="float32", enable_test=True), @@ -65,10 +75,12 @@ RERANK_MODELS = [ CLSPoolingRerankModelInfo( # classifier_pooling: mean "Alibaba-NLP/gte-reranker-modernbert-base", + mteb_score=0.33386, architecture="ModernBertForSequenceClassification", enable_test=True), CLSPoolingRerankModelInfo( "Alibaba-NLP/gte-multilingual-reranker-base", + mteb_score=0.33062, architecture="GteNewForSequenceClassification", hf_overrides={"architectures": ["GteNewForSequenceClassification"]}, enable_test=True), @@ -78,10 +90,6 @@ RERANK_MODELS = [ @pytest.mark.parametrize("model_info", MODELS) def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None: - if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct": - check_transformers_version(model_info.name, - max_transformers_version="4.53.2") - 
mteb_test_embed_models(hf_runner, vllm_runner, model_info) @@ -89,10 +97,6 @@ def test_embed_models_mteb(hf_runner, vllm_runner, def test_embed_models_correctness(hf_runner, vllm_runner, model_info: EmbedModelInfo, example_prompts) -> None: - if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct": - check_transformers_version(model_info.name, - max_transformers_version="4.53.2") - correctness_test_embed_models(hf_runner, vllm_runner, model_info, example_prompts) diff --git a/tests/models/language/pooling/test_intfloat.py b/tests/models/language/pooling/test_intfloat.py index 6cae53a660ad8..bc95475836e87 100644 --- a/tests/models/language/pooling/test_intfloat.py +++ b/tests/models/language/pooling/test_intfloat.py @@ -10,6 +10,7 @@ MODELS = [ ########## BertModel CLSPoolingEmbedModelInfo("intfloat/e5-small", architecture="BertModel", + mteb_score=0.742285423, enable_test=True), CLSPoolingEmbedModelInfo("intfloat/e5-base", architecture="BertModel", @@ -23,6 +24,7 @@ MODELS = [ ########## XLMRobertaModel CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-base", architecture="XLMRobertaModel", + mteb_score=0.779325955, enable_test=True), CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-large", architecture="XLMRobertaModel", @@ -36,7 +38,7 @@ MODELS = [ @pytest.mark.parametrize("model_info", MODELS) def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None: - mteb_test_embed_models(hf_runner, vllm_runner, model_info, atol=0.02) + mteb_test_embed_models(hf_runner, vllm_runner, model_info) @pytest.mark.parametrize("model_info", MODELS) diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py index 37c5bdc97dd98..c4e4835556a54 100644 --- a/tests/models/language/pooling/test_jina.py +++ b/tests/models/language/pooling/test_jina.py @@ -14,6 +14,7 @@ from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models EMBEDDING_MODELS = [ CLSPoolingEmbedModelInfo("jinaai/jina-embeddings-v3", 
+ mteb_score=0.824413164, architecture="XLMRobertaModel", is_matryoshka=True) ] @@ -21,6 +22,7 @@ EMBEDDING_MODELS = [ RERANK_MODELS = [ CLSPoolingRerankModelInfo( "jinaai/jina-reranker-v2-base-multilingual", + mteb_score=0.33643, architecture="XLMRobertaForSequenceClassification") ] diff --git a/tests/models/language/pooling/test_mxbai_rerank.py b/tests/models/language/pooling/test_mxbai_rerank.py index 73823deeff4e0..1731c6ae6fff7 100644 --- a/tests/models/language/pooling/test_mxbai_rerank.py +++ b/tests/models/language/pooling/test_mxbai_rerank.py @@ -20,6 +20,7 @@ RERANK_MODELS = [ LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2", architecture="Qwen2ForSequenceClassification", hf_overrides=mxbai_rerank_hf_overrides, + mteb_score=0.273, enable_test=True), LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2", architecture="Qwen2ForSequenceClassification", diff --git a/tests/models/language/pooling/test_nomic.py b/tests/models/language/pooling/test_nomic.py index 2d05958e9bcda..52a8ce6e6671f 100644 --- a/tests/models/language/pooling/test_nomic.py +++ b/tests/models/language/pooling/test_nomic.py @@ -10,6 +10,7 @@ from .mteb_utils import mteb_test_embed_models MODELS = [ CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1", architecture="NomicBertModel", + mteb_score=0.737568559, enable_test=True), CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1.5", architecture="NomicBertModel", @@ -19,6 +20,7 @@ MODELS = [ enable_test=False), CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe", architecture="NomicBertModel", + mteb_score=0.715488912, enable_test=True) ] diff --git a/tests/models/language/pooling/test_qwen3_reranker.py b/tests/models/language/pooling/test_qwen3_reranker.py index 5dd2d9eae9115..ebdacf9d0c673 100644 --- a/tests/models/language/pooling/test_qwen3_reranker.py +++ b/tests/models/language/pooling/test_qwen3_reranker.py @@ -20,6 +20,7 @@ qwen3_reranker_hf_overrides = { RERANK_MODELS = [ 
LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-0.6B", architecture="Qwen3ForSequenceClassification", + mteb_score=0.25736, hf_overrides=qwen3_reranker_hf_overrides, enable_test=True), LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-4B", diff --git a/tests/models/language/pooling/test_snowflake_arctic_embed.py b/tests/models/language/pooling/test_snowflake_arctic_embed.py index c22c78592e535..864f3d75ef5aa 100644 --- a/tests/models/language/pooling/test_snowflake_arctic_embed.py +++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py @@ -11,6 +11,7 @@ MODELS = [ CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-xs", is_matryoshka=False, architecture="BertModel", + mteb_score=0.714927797, enable_test=True), CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-s", is_matryoshka=False, @@ -23,6 +24,7 @@ MODELS = [ CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long", is_matryoshka=False, architecture="NomicBertModel", + mteb_score=0.681146831, enable_test=True), CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l", is_matryoshka=False, @@ -31,14 +33,17 @@ MODELS = [ CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5", is_matryoshka=True, architecture="BertModel", + mteb_score=0.649088363, enable_test=True), CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l-v2.0", is_matryoshka=True, architecture="XLMRobertaModel", + mteb_score=0.712258299, enable_test=True), CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0", is_matryoshka=True, architecture="GteModel", + mteb_score=0.706622444, enable_test=True), ] @@ -46,7 +51,7 @@ MODELS = [ @pytest.mark.parametrize("model_info", MODELS) def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None: - mteb_test_embed_models(hf_runner, vllm_runner, model_info, atol=0.02) + mteb_test_embed_models(hf_runner, vllm_runner, model_info) @pytest.mark.parametrize("model_info", MODELS) diff --git 
a/tests/models/language/pooling/test_st_projector.py b/tests/models/language/pooling/test_st_projector.py index 51ddbcc5ab249..bafeb4060d80a 100644 --- a/tests/models/language/pooling/test_st_projector.py +++ b/tests/models/language/pooling/test_st_projector.py @@ -10,6 +10,7 @@ ST_PROJECTOR_MODELS = [ CLSPoolingEmbedModelInfo( "TencentBAC/Conan-embedding-v1", architecture="BertModel", + mteb_score=0.688611955, enable_test=True, ), ] diff --git a/tests/models/utils.py b/tests/models/utils.py index 0fb1f5b3753b5..40a41afff8287 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -347,6 +347,7 @@ class ModelInfo: dtype: str = "auto" hf_overrides: Optional[dict[str, Any]] = None default_pooling_type: str = "" + mteb_score: Optional[float] = None enable_test: bool = True From 28f350e147c4b5c050c0080ef0d924c15ab87635 Mon Sep 17 00:00:00 2001 From: Jakub Smid <90085992+biba10@users.noreply.github.com> Date: Wed, 3 Sep 2025 12:47:55 +0200 Subject: [PATCH 55/95] Support add_generation_prompt in embeddings endpoint with chat request (#23931) Signed-off-by: biba10 --- vllm/entrypoints/openai/protocol.py | 8 ++++++++ vllm/entrypoints/openai/serving_embedding.py | 4 +--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 4881022325625..413e1dd8d6337 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1342,6 +1342,14 @@ class EmbeddingChatRequest(OpenAIBaseModel): truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None # --8<-- [start:chat-embedding-extra-params] + add_generation_prompt: bool = Field( + default=False, + description= + ("If true, the generation prompt will be added to the chat template. 
" + "This is a parameter used by chat template in tokenizer config of the " + "model."), + ) + add_special_tokens: bool = Field( default=False, description=( diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 0a0d98db2d0d8..c6d3509afda74 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -93,9 +93,7 @@ class EmbeddingMixin(OpenAIServing): or ctx.chat_template, chat_template_content_format=ctx. chat_template_content_format, - # In embedding requests, we are not generating tokens, - # so there is no need to append extra tokens to the input - add_generation_prompt=False, + add_generation_prompt=ctx.request.add_generation_prompt, continue_final_message=False, add_special_tokens=ctx.request.add_special_tokens, ) From 6997a25ac65ed6cc3c2be6d09ca45f633a345f63 Mon Sep 17 00:00:00 2001 From: qscqesze Date: Wed, 3 Sep 2025 19:27:04 +0800 Subject: [PATCH 56/95] [Model] Remove useless code from MiniMax implementation (#23982) Signed-off-by: QscQ Signed-off-by: qingjun --- vllm/model_executor/layers/mamba/linear_attn.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/vllm/model_executor/layers/mamba/linear_attn.py b/vllm/model_executor/layers/mamba/linear_attn.py index d93cef1a27ad4..5fe37a6289e01 100644 --- a/vllm/model_executor/layers/mamba/linear_attn.py +++ b/vllm/model_executor/layers/mamba/linear_attn.py @@ -83,17 +83,7 @@ class MiniMaxText01RMSNormTP(CustomOp): variance = tensor_model_parallel_all_reduce( variance) / self.tp_world x = x * torch.rsqrt(variance + self.variance_epsilon) - - weight = self.weight - if x.size(-1) != self.weight.size(0): - if self.weight.size(0) < x.size(-1): - repeat_count = (x.size(-1) + self.weight.size(0)) // x.size(-1) - full_weight = self.weight.repeat(repeat_count) - weight = full_weight[:x.size(-1)] - else: - weight = self.weight[:x.size(-1)] - - x = x.to(orig_dtype) * weight + x = 
x.to(orig_dtype) * self.weight return x def forward( From 4ba0c587ba3ad2ab419ba6f43a2d52946c58d530 Mon Sep 17 00:00:00 2001 From: dongbo910220 <32610838+dongbo910220@users.noreply.github.com> Date: Wed, 3 Sep 2025 22:17:20 +0800 Subject: [PATCH 57/95] FIX: Add libnuma-dev to Dockerfile for dev stage (#20388) Signed-off-by: dongbo910220 <1275604947@qq.com> --- docker/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/Dockerfile b/docker/Dockerfile index 75e8fa49f86c9..01b7aa0f44afd 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -261,6 +261,8 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match" # Use copy mode to avoid hardlink failures with Docker cache mounts ENV UV_LINK_MODE=copy +# Install libnuma-dev, required by fastsafetensors (fixes #20384) +RUN apt-get update && apt-get install -y libnuma-dev && rm -rf /var/lib/apt/lists/* COPY requirements/lint.txt requirements/lint.txt COPY requirements/test.txt requirements/test.txt COPY requirements/dev.txt requirements/dev.txt From 6d80ae83e1455cb0e47196cea557398fde0f03d1 Mon Sep 17 00:00:00 2001 From: Burkhard Ringlein Date: Wed, 3 Sep 2025 17:01:09 +0200 Subject: [PATCH 58/95] [Bugfix] Fixing division by zero in triton_attn if query_heads/kv_heads > 16 (#23424) Signed-off-by: Burkhard Ringlein --- vllm/attention/ops/triton_unified_attention.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index 56ebed0f52448..250e9b3890444 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -674,7 +674,8 @@ def unified_attention( num_queries_per_kv = num_query_heads // num_kv_heads head_size = q.shape[2] - BLOCK_M = 16 + BLOCK_M = 16 if num_queries_per_kv <= 16 else triton.next_power_of_2( + num_queries_per_kv) BLOCK_Q = BLOCK_M // num_queries_per_kv # Ideally we would launch with kernel with: From 
fa4311d85f3a5f6d445f20429ca6d38122908eeb Mon Sep 17 00:00:00 2001 From: nopperl <54780682+nopperl@users.noreply.github.com> Date: Thu, 4 Sep 2025 00:24:02 +0900 Subject: [PATCH 59/95] [V1] v1 engine + full CUDA graph support for PLaMo2 (#23998) Signed-off-by: Hemmi Shinichi Signed-off-by: nopperl <54780682+nopperl@users.noreply.github.com> Co-authored-by: Hemmi Shinichi Co-authored-by: Thomas Parnell --- docs/models/supported_models.md | 2 +- docs/usage/v1_guide.md | 2 +- .../models/language/generation/test_hybrid.py | 5 +- tests/models/registry.py | 2 - vllm/config/compilation.py | 1 + vllm/model_executor/models/plamo2.py | 462 +++++++++++++----- 6 files changed, 349 insertions(+), 125 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 4b4cebb6a31c2..7f54d98527686 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -395,7 +395,7 @@ th { | `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Phi4FlashForCausalLM` | Phi-4-mini-flash-reasoning | `microsoft/microsoft/Phi-4-mini-instruct`, etc. | | | | | `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ | ✅︎ | -| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | ✅︎ | | +| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | ✅︎ | ✅︎ | | `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. 
| ✅︎ | ✅︎ | ✅︎ | diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index f71805436a6ae..525f740d12a7f 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -110,7 +110,7 @@ Models using selective state-space mechanisms instead of standard transformer at Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`,`FalconMambaForCausalLM`) are supported. Hybrid models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`, -`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). +`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`, `Plamo2ForCausalLM`). Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`, `Lfm2ForCausalLM`). diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 9e97e3fa65775..b44ddc61b6c8c 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -25,8 +25,7 @@ SSM_MODELS = [ HYBRID_MODELS = [ "ai21labs/Jamba-tiny-dev", - # skipping until vLLM implementation issues are resolved - # "pfnet/plamo-2-1b", + "pfnet/plamo-2-1b", "Zyphra/Zamba2-1.2B-instruct", "hmellor/tiny-random-BambaForCausalLM", "ibm-granite/granite-4.0-tiny-preview", @@ -37,6 +36,7 @@ HYBRID_MODELS = [ V1_SUPPORTED_MODELS = [ "state-spaces/mamba-130m-hf", "ai21labs/Jamba-tiny-dev", + "pfnet/plamo-2-1b", "yujiepan/mamba2-codestral-v0.1-tiny-random", "Zyphra/Zamba2-1.2B-instruct", "hmellor/tiny-random-BambaForCausalLM", @@ -47,6 +47,7 @@ V1_SUPPORTED_MODELS = [ FULL_CUDA_GRAPH_MODELS = [ "ai21labs/Jamba-tiny-dev", + "pfnet/plamo-2-1b", "Zyphra/Zamba2-1.2B-instruct", ] diff --git a/tests/models/registry.py b/tests/models/registry.py index 
4cf3dd6e08ced..f1f61c6151349 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -287,8 +287,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct", trust_remote_code=True), "Plamo2ForCausalLM": _HfExamplesInfo("pfnet/plamo-2-1b", - max_transformers_version="4.53", - transformers_version_reason="vLLM impl inherits PreTrainedModel and clashes with get_input_embeddings", # noqa: E501 trust_remote_code=True), "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat", max_transformers_version="4.53", diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 28ad3d2f535d3..677fb069bc07a 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -340,6 +340,7 @@ class CompilationConfig: "vllm.mamba_mixer", "vllm.short_conv", "vllm.linear_attention", + "vllm.plamo2_mamba_mixer", ] def compute_hash(self) -> str: diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 7f70e44b10a6d..b9869f5e58800 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -3,19 +3,24 @@ """Inference-only PLaMo2 model.""" from collections.abc import Iterable from itertools import islice -from typing import Optional +from typing import TYPE_CHECKING, Optional + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend import torch from torch import nn -from transformers import PretrainedConfig, PreTrainedModel +from transformers import PretrainedConfig +from vllm import envs from vllm.attention.backends.abstract import AttentionMetadata from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile -from vllm.config import VllmConfig +from vllm.config import VllmConfig, get_current_vllm_config from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group -from vllm.forward_context 
import get_forward_context +from vllm.forward_context import ForwardContext, get_forward_context +from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -23,8 +28,11 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.mamba.abstract import MambaBase from vllm.model_executor.layers.mamba.mamba2_metadata import ( - Mamba2Metadata, prepare_mamba2_metadata) + Mamba2Metadata, prepare_mamba2_metadata, update_metadata) +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateDtypeCalculator, MambaStateShapeCalculator) from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( causal_conv1d_fn, causal_conv1d_update) from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( @@ -39,7 +47,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.model_loader.weight_utils import ( composed_weight_loader, default_weight_loader, sharded_weight_loader) from vllm.model_executor.models.interfaces import (HasInnerState, IsHybrid, - SupportsPP, SupportsV0Only) + SupportsPP) from vllm.model_executor.models.mamba_cache import (MambaCacheManager, MambaCacheParams) from vllm.model_executor.models.utils import ( @@ -47,8 +55,10 @@ from vllm.model_executor.models.utils import ( make_layers, maybe_prefix) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors -from vllm.utils import LayerBlockType +from vllm.utils import LayerBlockType, direct_register_custom_op +from vllm.v1.attention.backends.mamba2_attn import 
Mamba2AttentionMetadata # Only used for type hinting. @@ -73,20 +83,6 @@ class Plamo2Config(PretrainedConfig): # type: ignore vocab_size: int -class Plamo2PreTrainedModel(PreTrainedModel): # type: ignore - - def _init_weights(self, module: torch.nn.Module) -> None: - std = 0.02 - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - def is_mamba(config: Plamo2Config, i: int) -> bool: assert config.mamba_step > 1 @@ -99,7 +95,8 @@ def is_mamba(config: Plamo2Config, i: int) -> bool: # Adapted from: # vllm.model_executor.layers.mamba.mamba_mixer2.MambaMixer2 # transformers.models.mamba.modeling_mamba.MambaMixer -class Plamo2MambaMixer(nn.Module): +@CustomOp.register(name="plamo2_mamba_mixer") +class Plamo2MambaMixer(MambaBase, CustomOp): def __init__(self, vllm_config: VllmConfig, @@ -108,6 +105,8 @@ class Plamo2MambaMixer(nn.Module): **kwargs) -> None: super().__init__() self.config = vllm_config.model_config.hf_config + self.cache_config = vllm_config.cache_config + self.model_config = vllm_config.model_config self.quant_config = vllm_config.quant_config self.hidden_size = self.config.hidden_size self.ssm_state_size = self.config.mamba_d_state @@ -115,8 +114,6 @@ class Plamo2MambaMixer(nn.Module): self.intermediate_size = (self.config.mamba_num_heads * self.config.hidden_size_per_head) self.tp_size = get_tensor_model_parallel_world_size() - self.intermediate_size_per_tp_worker = \ - self.intermediate_size // self.tp_size self.head_dim = self.config.hidden_size_per_head self.num_heads = self.config.mamba_num_heads self.time_step_rank = max(64, self.hidden_size // 16) @@ -197,6 +194,22 @@ class Plamo2MambaMixer(nn.Module): self.C_norm = RMSNorm(self.ssm_state_size, eps=self.config.rms_norm_eps) + 
self.chunk_size = self.config.mamba_chunk_size + + if envs.VLLM_USE_V1: + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + # The outer list is for v0 PP virtual engine. Though this code path + # only runs for v1, we have to do this to unify with the interface + # of Attention + v0 PP. + # The inner tuple is (conv_state, ssm_state) + self.kv_cache = [(torch.tensor([]), torch.tensor([]))] + assert self.chunk_size != -1, "chunk_size must be set for v1" + + self.prefix = prefix + def _project_ssm_parameters(self, hidden_states): ssm_parameters = self.bcdt_proj(hidden_states) B, C, time_step = torch.split( @@ -212,25 +225,76 @@ class Plamo2MambaMixer(nn.Module): dt = self.dt_proj(time_step) return B, C, dt - def forward( + def forward_native( self, hidden_states: torch.Tensor, + output: torch.Tensor, mamba_cache_params: MambaCacheParams, mamba2_metadata: Mamba2Metadata, **kwargs, - ) -> torch.Tensor: + ): + pass + def forward( + self, + hidden_states: torch.Tensor, + output: torch.Tensor, + mamba_cache_params: MambaCacheParams, + mamba2_metadata: Mamba2Metadata, + **kwargs, + ): + if not envs.VLLM_USE_V1: + CustomOp.forward(self, hidden_states, output, mamba_cache_params, + mamba2_metadata) + else: + torch.ops.vllm.plamo2_mamba_mixer( + hidden_states, + output, + self.prefix, + ) + + def forward_cuda( + self, + hidden_states: torch.Tensor, + output: torch.Tensor, + mamba_cache_params: MambaCacheParams, + mamba2_metadata: Mamba2Metadata, + **kwargs, + ): + + forward_context = get_forward_context() # mamba2_metadata contains metadata necessary for the mamba2 triton # kernels to operate in continuous batching and in chunked prefill # modes; they are computed at top-level model forward since they # stay the same and reused for all mamba layers in the same iteration - attn_metadata: 
AttentionMetadata = get_forward_context().attn_metadata - - num_prefills = attn_metadata.num_prefills # request count - num_decodes = attn_metadata.num_decode_tokens # token count (=request) - num_prefill_tokens = attn_metadata.num_prefill_tokens # token count - has_prefill = num_prefills > 0 - has_decode = num_decodes > 0 + attn_metadata: AttentionMetadata = forward_context.attn_metadata + if envs.VLLM_USE_V1: + if attn_metadata is not None: + assert isinstance(attn_metadata, dict) + attn_metadata = attn_metadata[self.prefix] + mamba2_metadata = attn_metadata + assert isinstance(attn_metadata, Mamba2AttentionMetadata) + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + # conv_state = (..., dim, width-1) yet contiguous along 'dim' + conv_state = self_kv_cache[0].transpose(-1, -2) + ssm_state = self_kv_cache[1] + state_indices_tensor = attn_metadata.state_indices_tensor + has_initial_states_p = attn_metadata.has_initial_states_p + prep_initial_states = attn_metadata.prep_initial_states + chunk_size = attn_metadata.chunk_size + seq_idx_p = attn_metadata.seq_idx_p + chunk_indices_p = attn_metadata.chunk_indices_p + chunk_offsets_p = attn_metadata.chunk_offsets_p + else: + conv_state = mamba_cache_params.conv_state + ssm_state = mamba_cache_params.ssm_state + state_indices_tensor = mamba_cache_params.state_indices_tensor + has_initial_states_p = mamba2_metadata.has_initial_states + prep_initial_states = mamba2_metadata.prep_initial_states + chunk_size = mamba2_metadata.chunk_size + seq_idx_p = mamba2_metadata.seq_idx + chunk_indices_p = mamba2_metadata.chunk_indices + chunk_offsets_p = mamba2_metadata.chunk_offsets # 1. 
Gated MLP's linear projection projected_states = self.in_proj(hidden_states) @@ -240,23 +304,59 @@ class Plamo2MambaMixer(nn.Module): conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2)) + if envs.VLLM_USE_V1 and attn_metadata is None: + # V1 profile run + hidden_states = (hidden_states.transpose(0, 1).clone().transpose( + 0, 1)).contiguous() + output[:] = self.out_proj(hidden_states) + return + + num_prefills = attn_metadata.num_prefills # request count + num_decodes = attn_metadata.num_decode_tokens # token count (=request) + num_prefill_tokens = attn_metadata.num_prefill_tokens # token count + has_prefill = num_prefills > 0 + has_decode = num_decodes > 0 + num_actual_tokens = num_prefill_tokens + num_decodes + + # NOTE: V0 put prefill before decode, v1 puts decode before prefill # Separate prefill and decode by splitting varlen input # Split along token dimension - hidden_states_p, hidden_states_d = torch.split( - hidden_states, - [num_prefill_tokens, num_decodes], - dim=0, - ) - gate_p, gate_d = torch.split(gate, [num_prefill_tokens, num_decodes], - dim=0) - # Split along batch dimension - state_indices_tensor_p, state_indices_tensor_d = torch.split( - mamba_cache_params.state_indices_tensor, - [num_prefills, num_decodes], - dim=0, - ) - query_start_loc_p = (attn_metadata.query_start_loc[:num_prefills + 1] - if has_prefill else None) + if envs.VLLM_USE_V1: + hidden_states_d, hidden_states_p = torch.split( + hidden_states[:num_actual_tokens], + [num_decodes, num_prefill_tokens], + dim=0, + ) + gate_d, gate_p = torch.split(gate[:num_actual_tokens], + [num_decodes, num_prefill_tokens], + dim=0) + # Split along batch dimension + state_indices_tensor_d, state_indices_tensor_p = torch.split( + state_indices_tensor, + [num_decodes, num_prefills], + dim=0, + ) + query_start_loc_p = ( + attn_metadata.query_start_loc[-num_prefills - 1:] - + num_decodes if has_prefill else None) + else: + hidden_states_p, hidden_states_d = 
torch.split( + hidden_states, + [num_prefill_tokens, num_decodes], + dim=0, + ) + gate_p, gate_d = torch.split(gate, + [num_prefill_tokens, num_decodes], + dim=0) + # Split along batch dimension + state_indices_tensor_p, state_indices_tensor_d = torch.split( + state_indices_tensor, + [num_prefills, num_decodes], + dim=0, + ) + query_start_loc_p = (attn_metadata.query_start_loc[:num_prefills + + 1] + if has_prefill else None) # Preallocate output tensor to avoid memcpy cost for merging prefill # and decode outputs @@ -268,25 +368,38 @@ class Plamo2MambaMixer(nn.Module): dtype=hidden_states.dtype, device=hidden_states.device, ) - preallocated_ssm_out_p, preallocated_ssm_out_d = torch.split( - preallocated_ssm_out, - [num_prefill_tokens, num_decodes], - dim=0, - ) + if envs.VLLM_USE_V1: + preallocated_ssm_out_d, preallocated_ssm_out_p = torch.split( + preallocated_ssm_out, + [num_decodes, num_prefill_tokens], + dim=0, + ) + else: + preallocated_ssm_out_p, preallocated_ssm_out_d = torch.split( + preallocated_ssm_out, + [num_prefill_tokens, num_decodes], + dim=0, + ) # Process prefill requests if has_prefill: # 2. 
Convolution sequence transformation # - "cache_indices" updates the conv_state cache in positions - # pointed to by "mamba_cache_params.state_indices_tensor" + # pointed to by "state_indices_tensor" + x = hidden_states_p.transpose( + 0, 1) # this is the form that causal-conv see + if mamba2_metadata.cu_seqlen is None: + mamba2_metadata = update_metadata(x, query_start_loc_p, + mamba2_metadata) hidden_states_p = causal_conv1d_fn( - hidden_states_p.transpose(0, 1), + x, conv_weights, self.conv1d.bias, activation=self.activation, - conv_states=mamba_cache_params.conv_state, - has_initial_state=mamba2_metadata.has_initial_states, + conv_states=conv_state, + has_initial_state=has_initial_states_p, cache_indices=state_indices_tensor_p, + metadata=mamba2_metadata, query_start_loc=query_start_loc_p) hidden_states_p = hidden_states_p.transpose(0, 1) hidden_states_p = hidden_states_p[:num_prefill_tokens] @@ -299,12 +412,16 @@ class Plamo2MambaMixer(nn.Module): # 3. State Space Model sequence transformation initial_states = None - if (mamba2_metadata.has_initial_states is not None - and mamba2_metadata.prep_initial_states): + if has_initial_states_p is not None and prep_initial_states: # making a copy of the states - initial_states = torch.where( - mamba2_metadata.has_initial_states[:, None, None, None], - mamba_cache_params.ssm_state[state_indices_tensor_p], 0) + if envs.VLLM_USE_V1: + initial_states = torch.where( + has_initial_states_p[:, None, None, None], + ssm_state[state_indices_tensor_p], 0) + else: + initial_states = torch.where( + has_initial_states_p[:num_prefills, None, None, None], + ssm_state[state_indices_tensor_p], 0) varlen_state = mamba_chunk_scan_combined( hidden_states_p.view(1, num_prefill_tokens, self.num_heads // self.tp_size, @@ -313,15 +430,15 @@ class Plamo2MambaMixer(nn.Module): self.A, B.view(1, num_prefill_tokens, 1, -1), C.view(1, num_prefill_tokens, 1, -1), - chunk_size=mamba2_metadata.chunk_size, + chunk_size=chunk_size, D=self.D, 
z=gate_p.view(1, num_prefill_tokens, self.num_heads // self.tp_size, self.head_dim), dt_bias=self.dt_bias, - seq_idx=mamba2_metadata.seq_idx, - chunk_indices=mamba2_metadata.chunk_indices, - chunk_offsets=mamba2_metadata.chunk_offsets, - cu_seqlens=attn_metadata.query_start_loc[:num_prefills + 1], + seq_idx=seq_idx_p, + chunk_indices=chunk_indices_p, + chunk_offsets=chunk_offsets_p, + cu_seqlens=query_start_loc_p, initial_states=initial_states, return_varlen_states=True, return_final_states=False, @@ -329,18 +446,19 @@ class Plamo2MambaMixer(nn.Module): dt_limit=(0.0, float("inf")), out=preallocated_ssm_out_p.view(1, num_prefill_tokens, -1, self.head_dim), + state_dtype=ssm_state.dtype, ) # update ssm states # - varlen state is a (batch, nheads, headdim, dstate) tensor - mamba_cache_params.ssm_state[state_indices_tensor_p] = varlen_state + ssm_state[state_indices_tensor_p] = varlen_state # Process decode requests if has_decode: # 2. Convolution sequence transformation hidden_states_d = causal_conv1d_update( hidden_states_d, - mamba_cache_params.conv_state, + conv_state, conv_weights, self.conv1d.bias, self.activation, @@ -363,8 +481,10 @@ class Plamo2MambaMixer(nn.Module): # - the hidden is reshaped into (bs, num_heads, head_dim) # - mamba_cache_params.ssm_state's slots will be selected # using state_indices_tensor_d + + # NOTE: final output is an in-place update of out tensor selective_state_update( - mamba_cache_params.ssm_state, + ssm_state, hidden_states_d, dt, A, @@ -378,11 +498,68 @@ class Plamo2MambaMixer(nn.Module): out=preallocated_ssm_out_d.view(num_decodes, -1, self.head_dim), ) - assert self.num_heads % self.tp_size == 0 # 4. 
Final linear projection - out = self.out_proj(preallocated_ssm_out) - return out + output[:num_actual_tokens] = self.out_proj(preallocated_ssm_out) + + def get_state_dtype(self) -> tuple[torch.dtype, torch.dtype]: + assert self.model_config is not None + assert self.cache_config is not None + return MambaStateDtypeCalculator.mamba2_state_dtype( + self.model_config.dtype, + self.cache_config.mamba_cache_dtype, + self.cache_config.mamba_ssm_cache_dtype, + ) + + def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: + return MambaStateShapeCalculator.mamba2_state_shape( + intermediate_size=self.intermediate_size, + tp_world_size=get_tensor_model_parallel_world_size(), + n_groups=0, + num_heads=self.num_heads, + head_dim=self.head_dim, + state_size=self.ssm_state_size, + conv_kernel=self.conv_kernel_size, + ) + + @property + def mamba_type(self) -> str: + return "mamba2" + + def get_attn_backend(self) -> type["AttentionBackend"]: + from vllm.v1.attention.backends.mamba2_attn import ( + Mamba2AttentionBackend) + return Mamba2AttentionBackend + + +def plamo2_mamba_mixer( + hidden_states: torch.Tensor, + output: torch.Tensor, + layer_name: str, +) -> None: + forward_context: ForwardContext = get_forward_context() + self = forward_context.no_compile_layers[layer_name] + self.forward_cuda(hidden_states=hidden_states, + output=output, + mamba_cache_params=None, + mamba2_metadata=None) + + +def plamo2_mamba_mixer_fake( + hidden_states: torch.Tensor, + output: torch.Tensor, + layer_name: str, +) -> None: + return + + +direct_register_custom_op( + op_name="plamo2_mamba_mixer", + op_func=plamo2_mamba_mixer, + mutates_args=["output"], + fake_impl=plamo2_mamba_mixer_fake, + dispatch_key=current_platform.dispatch_key, +) class DenseMLP(nn.Module): @@ -418,7 +595,6 @@ class DenseMLP(nn.Module): return self.down_proj(h) -@support_torch_compile class Plamo2AttentionMixer(nn.Module): def __init__(self, @@ -575,12 +751,24 @@ class Plamo2DecoderLayer(nn.Module): 
hidden_states, residual = self.pre_mixer_norm( hidden_states, residual) + if self.is_mamba: + # Plamo2MambaMixer writes output to this tensor + output = torch.empty_like(hidden_states) + mixer_kwargs = { + "output": output, + "mamba_cache_params": mamba_cache_params, + "mamba2_metadata": mamba2_metadata, + } + else: + mixer_kwargs = { + "positions": positions, + } hidden_states = self.mixer( - positions=positions, hidden_states=hidden_states, - mamba_cache_params=mamba_cache_params, - mamba2_metadata=mamba2_metadata, + **mixer_kwargs, ) + if self.is_mamba: + hidden_states = output hidden_states = self.post_mixer_norm(hidden_states) # Fully Connected hidden_states, residual = self.pre_mlp_norm(hidden_states, residual) @@ -591,7 +779,7 @@ class Plamo2DecoderLayer(nn.Module): class Plamo2Decoder(torch.nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() config = vllm_config.model_config.hf_config extra_kwargs = {"is_lora_enabled": bool(vllm_config.lora_config)} @@ -617,7 +805,7 @@ class Plamo2Decoder(torch.nn.Module): mamba_cache_index = 0 for layer in islice(self.layers, self.start_layer, self.end_layer): layer_mamba_cache_params = None - if layer.is_mamba: + if layer.is_mamba and mamba_cache_params is not None: layer_mamba_cache_params = mamba_cache_params.at_layer_idx( mamba_cache_index) mamba_cache_index += 1 @@ -632,10 +820,11 @@ class Plamo2Decoder(torch.nn.Module): return hidden_states, residual -class Plamo2Model(Plamo2PreTrainedModel): +@support_torch_compile +class Plamo2Model(torch.nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__(vllm_config.model_config.hf_config) + super().__init__() config = vllm_config.model_config.hf_config @@ -653,9 +842,9 @@ class Plamo2Model(Plamo2PreTrainedModel): self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory( 
["hidden_states", "residual"], config.hidden_size)) - self.layers = Plamo2Decoder(vllm_config, prefix=f"{prefix}.layers") + self.layers = Plamo2Decoder(vllm_config=vllm_config, + prefix=f"{prefix}.layers") self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_init() def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.embed_tokens(input_ids) @@ -679,11 +868,16 @@ class Plamo2Model(Plamo2PreTrainedModel): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - attn_metadata: AttentionMetadata = get_forward_context().attn_metadata - mamba2_metadata = prepare_mamba2_metadata( - chunk_size=self.config.mamba_chunk_size, - attn_metadata=attn_metadata, - ) + if not envs.VLLM_USE_V1: + attn_metadata: AttentionMetadata = get_forward_context( + ).attn_metadata + mamba2_metadata = prepare_mamba2_metadata( + chunk_size=self.config.mamba_chunk_size, + attn_metadata=attn_metadata, + ) + else: + # v1 get mamba2_metadata from forward_context + mamba2_metadata = None hidden_states, residual = self.layers( positions=positions, @@ -701,8 +895,7 @@ class Plamo2Model(Plamo2PreTrainedModel): return hidden_states -class Plamo2ForCausalLM(Plamo2PreTrainedModel, HasInnerState, SupportsPP, - IsHybrid, SupportsV0Only): +class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -712,12 +905,10 @@ class Plamo2ForCausalLM(Plamo2PreTrainedModel, HasInnerState, SupportsPP, } def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() config = vllm_config.model_config.hf_config scheduler_config = vllm_config.scheduler_config - assert not vllm_config.cache_config.enable_prefix_caching, \ - "PLaMo2 currently does not support prefix caching" - super().__init__(config) self.config = config self.vllm_config = vllm_config self.model_config = vllm_config.model_config @@ -751,8 +942,6 @@ class 
Plamo2ForCausalLM(Plamo2PreTrainedModel, HasInnerState, SupportsPP, self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) - # Initialize weights and apply final processing - self.post_init() def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) @@ -763,19 +952,27 @@ class Plamo2ForCausalLM(Plamo2PreTrainedModel, HasInnerState, SupportsPP, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs): - if self.mamba_cache is None: - num_mamba_layers = self.model_config.get_num_layers_by_block_type( - self.vllm_config.parallel_config, LayerBlockType.mamba) + if not envs.VLLM_USE_V1: + if self.mamba_cache is None: + num_mamba_layers = ( + self.model_config.get_num_layers_by_block_type( + self.vllm_config.parallel_config, + LayerBlockType.mamba)) - self.mamba_cache = MambaCacheManager( - self.vllm_config, - num_mamba_layers, - *self._get_mamba_cache_shape(), - self.lm_head.weight.dtype, - self.lm_head.weight.dtype, - ) + mamba_state_shape = self.get_mamba_state_shape_from_config( + self.vllm_config, use_v1=False) + mamba_state_dtype = \ + self.get_mamba_state_dtype_from_config( + self.vllm_config) + self.mamba_cache = MambaCacheManager(self.vllm_config, + num_mamba_layers, + *mamba_state_shape, + *mamba_state_dtype) - mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) + mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) + else: + # NOTE: mamba_cache_params is not needed for v1 + mamba_cache_params = None hidden_states = self.model(input_ids, positions, mamba_cache_params, intermediate_tensors, inputs_embeds) @@ -788,21 +985,48 @@ class Plamo2ForCausalLM(Plamo2PreTrainedModel, HasInnerState, SupportsPP, def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) - def 
_get_mamba_cache_shape( - self) -> tuple[tuple[int, int], tuple[int, int, int]]: - world_size = get_tensor_model_parallel_world_size() - hidden_size = (self.config.mamba_num_heads * - self.config.hidden_size_per_head) - conv_state_shape = ( - hidden_size // world_size, - self.config.mamba_d_conv - 1, + @classmethod + def get_mamba_state_dtype_from_config( + cls, + vllm_config: "VllmConfig", + ) -> tuple[torch.dtype, torch.dtype]: + + return MambaStateDtypeCalculator.mamba2_state_dtype( + vllm_config.model_config.dtype, + vllm_config.cache_config.mamba_cache_dtype, + vllm_config.cache_config.mamba_ssm_cache_dtype, ) - temporal_state_shape = ( - divide(self.config.mamba_num_heads, world_size), - self.config.hidden_size_per_head, - self.config.mamba_d_state, + + @classmethod + def get_mamba_state_shape_from_config( + cls, + vllm_config: "VllmConfig", + use_v1: bool = True, + ) -> tuple[tuple[int, int], tuple[int, int, int]]: + """Calculate shapes for Mamba's convolutional and state caches. 
+ Args: + vllm_config: vLLM config + use_v1: Get shapes for V1 (or V0) + Returns: + Tuple containing: + - conv_state_shape: Shape for convolutional state cache + - temporal_state_shape: Shape for state space model cache + """ + parallel_config = vllm_config.parallel_config + hf_config = vllm_config.model_config.hf_config + intermediate_size =\ + hf_config.mamba_num_heads * hf_config.hidden_size_per_head + + return MambaStateShapeCalculator.mamba2_state_shape( + intermediate_size=intermediate_size, + tp_world_size=parallel_config.tensor_parallel_size, + n_groups=0, + num_heads=hf_config.mamba_num_heads, + head_dim=hf_config.hidden_size_per_head, + state_size=hf_config.mamba_d_state, + conv_kernel=hf_config.mamba_d_conv, + use_v1=use_v1, ) - return conv_state_shape, temporal_state_shape def compute_logits( self, From e9b92dcd89e8d05f162a8fdaa3d5d60012615514 Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Wed, 3 Sep 2025 12:35:18 -0400 Subject: [PATCH 60/95] [Kernels] Overlap shared experts with send/recv (#23273) Signed-off-by: Bill Nell --- docs/design/fused_moe_modular_kernel.md | 8 +- examples/offline_inference/data_parallel.py | 8 + tests/kernels/moe/test_pplx_moe.py | 86 ++++++-- tests/kernels/moe/utils.py | 149 ++++++++++++++ .../device_communicators/all2all.py | 7 +- .../base_device_communicator.py | 5 +- .../fused_moe/deepep_ht_prepare_finalize.py | 130 ++++++++---- .../fused_moe/deepep_ll_prepare_finalize.py | 50 ++++- .../flashinfer_cutlass_prepare_finalize.py | 4 +- .../layers/fused_moe/fused_batched_moe.py | 4 +- vllm/model_executor/layers/fused_moe/layer.py | 185 ++++++++++++++---- .../layers/fused_moe/modular_kernel.py | 142 +++++++++++--- .../layers/fused_moe/pplx_prepare_finalize.py | 77 +++++++- .../layers/fused_moe/prepare_finalize.py | 4 +- .../layers/quantization/awq_marlin.py | 4 +- .../layers/quantization/bitsandbytes.py | 2 +- .../compressed_tensors_moe.py | 12 +- .../layers/quantization/experts_int8.py 
| 4 +- .../model_executor/layers/quantization/fp8.py | 4 +- .../layers/quantization/gguf.py | 4 +- .../layers/quantization/gptq_marlin.py | 2 +- .../layers/quantization/modelopt.py | 4 +- .../layers/quantization/moe_wna16.py | 4 +- .../layers/quantization/mxfp4.py | 4 +- .../layers/quantization/quark/quark_moe.py | 6 +- .../model_executor/layers/quantization/rtn.py | 4 +- .../layers/shared_fused_moe/__init__.py | 6 + .../shared_fused_moe/shared_fused_moe.py | 56 ++++++ vllm/model_executor/models/deepseek_v2.py | 103 ++++++---- vllm/model_executor/models/glm4_moe.py | 2 + vllm/model_executor/models/llama4.py | 29 +-- vllm/v1/worker/gpu_worker.py | 3 +- 32 files changed, 885 insertions(+), 227 deletions(-) create mode 100644 vllm/model_executor/layers/shared_fused_moe/__init__.py create mode 100644 vllm/model_executor/layers/shared_fused_moe/shared_fused_moe.py diff --git a/docs/design/fused_moe_modular_kernel.md b/docs/design/fused_moe_modular_kernel.md index b03483d1c9b21..cb2037b575e53 100644 --- a/docs/design/fused_moe_modular_kernel.md +++ b/docs/design/fused_moe_modular_kernel.md @@ -54,8 +54,8 @@ The `FusedMoEModularKernel` acts as a bridge between the `FusedMoEPermuteExperts ### FusedMoEPrepareAndFinalize -The `FusedMoEPrepareAndFinalize` abstract class exposes `prepare` and `finalize` functions. -The `prepare` function is responsible for input activation Quantization and All2All Dispatch. The `finalize` function is responsible for invoking the All2All Combine. Additionally the `finalize` function may or may not do the TopK weight application and reduction (Please refer to the TopKWeightAndReduce section) +The `FusedMoEPrepareAndFinalize` abstract class exposes `prepare`, `prepare_no_receive` and `finalize` functions. +The `prepare` function is responsible for input activation Quantization and All2All Dispatch. If implemented, The `prepare_no_receive` is like `prepare` except it does not wait to receive results from other workers. 
Instead it returns a "receiver" callback that must be invoked to wait for the final results of worker. It is not required that this method is supported by all `FusedMoEPrepareAndFinalize` classes, but if it is available, it can be used to interleave work with the initial all to all communication, e.g. interleaving shared experts with fused experts. The `finalize` function is responsible for invoking the All2All Combine. Additionally the `finalize` function may or may not do the TopK weight application and reduction (Please refer to the TopKWeightAndReduce section) ![](../assets/design/fused_moe_modular_kernel/prepare_and_finalize_blocks.png "FusedMoEPrepareAndFinalize Blocks") @@ -146,6 +146,10 @@ This section describes the significance of the various functions exposed by the `FusedMoEPrepareAndFinalize::prepare()`: The prepare method implements the Quantization and All2All Dispatch. Typically the Dispatch function from the relevant All2All Manager is invoked. +`FusedMoEPrepareAndFinalize::has_prepare_no_receive()`: Indicates whether or not this subclass implements `prepare_no_receive`. Defaults to False. + +`FusedMoEPrepareAndFinalize::prepare_no_receive()`: The prepare_no_receive method implements the Quantization and All2All Dispatch. It does not wait for the result of the dispatch operation but instead returns a thunk that can be invoked to wait for the final results. Typically the Dispatch function from the relevant All2All Manager is invoked. + `FusedMoEPrepareAndFinalize::finalize()`: Maybe perform TopK Weight Application and Reduction and All2All Combine. Typically the Combine function from the relevant All2AllManager is invoked. `FusedMoEPrepareAndFinalize::activation_format()`: Return `FusedMoEActivationFormat.BatchedExperts` if the output of the prepare method (i.e. the All2All dispatch) is Batched. Return `FusedMoEActivationFormat.Standard` otherwise. 
diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index dd7559451c4c6..36d805a32db7a 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -87,6 +87,11 @@ def parse_args(): default=0.8, help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."), ) + parser.add_argument( + "--compilation-config", + type=int, + help=("Compilation optimization (O) level 0-3."), + ) parser.add_argument( "--quantization", type=str, @@ -106,6 +111,7 @@ def main( trust_remote_code, max_num_seqs, max_model_len, + compilation_config, gpu_memory_utilization, quantization, ): @@ -162,6 +168,7 @@ def main( max_model_len=max_model_len, gpu_memory_utilization=gpu_memory_utilization, quantization=quantization, + compilation_config=compilation_config, ) outputs = llm.generate(prompts, sampling_params) # Print the outputs. @@ -218,6 +225,7 @@ if __name__ == "__main__": args.trust_remote_code, args.max_num_seqs, args.max_model_len, + args.compilation_config, args.gpu_memory_utilization, args.quantization, ), diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py index 3f36d7ada2e94..394f521140859 100644 --- a/tests/kernels/moe/test_pplx_moe.py +++ b/tests/kernels/moe/test_pplx_moe.py @@ -4,10 +4,11 @@ Run `pytest tests/kernels/test_pplx_moe.py`. 
""" +import copy import itertools import textwrap import traceback -from typing import Callable, Optional +from typing import Callable, Optional, Union import pytest import torch @@ -21,7 +22,10 @@ try: except ImportError: has_pplx = False -from tests.kernels.moe.utils import make_test_weights, naive_batched_moe +from tests.kernels.moe.modular_kernel_tools.parallel_utils import ( + _set_vllm_config) +from tests.kernels.moe.utils import (make_shared_experts, make_test_weights, + naive_batched_moe) from tests.kernels.quant_utils import dequant from tests.kernels.utils import torch_experts from vllm.config import VllmConfig, set_current_vllm_config @@ -511,7 +515,8 @@ def pplx_moe( block_shape: Optional[list[int]] = None, use_compile: bool = False, use_cudagraphs: bool = True, -) -> torch.Tensor: + shared_experts: Optional[torch.nn.Module] = None, +) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: num_tokens, hidden_dim = a.shape num_experts = w1.shape[0] @@ -546,6 +551,7 @@ def pplx_moe( fused_experts = FusedMoEModularKernel( prepare_finalize, experts, + shared_experts, ) # Note: workers with the same dp_rank must use the exact same inputs. 
@@ -586,7 +592,11 @@ def pplx_moe( global_num_experts=num_experts) if use_cudagraphs: - out.fill_(0) + if isinstance(out, tuple): + out[0].fill_(0) + out[1].fill_(0) + else: + out.fill_(0) stream = torch.cuda.Stream() graph = torch.cuda.CUDAGraph() with torch.cuda.graph(graph, stream=stream): @@ -626,6 +636,7 @@ def _pplx_moe( per_act_token_quant: bool = False, block_shape: Optional[list[int]] = None, use_internode: bool = False, + shared_experts: Optional[torch.nn.Module] = None, ): try: if use_internode: @@ -666,6 +677,11 @@ def _pplx_moe( with set_current_vllm_config(vllm_config), override_config(moe_config): topk_weight, topk_ids, _ = fused_topk(a, score, topk, False) + if shared_experts is not None: + shared_output = shared_experts(a) + else: + shared_output = None + torch_output = torch_experts( a, w1, @@ -696,7 +712,7 @@ def _pplx_moe( block_shape=block_shape, ) - pplx_output = pplx_moe( + pplx_outputs = pplx_moe( group_name, rank, world_size, @@ -713,8 +729,24 @@ def _pplx_moe( quant_dtype=quant_dtype, per_act_token_quant=per_act_token_quant, block_shape=block_shape, + shared_experts=shared_experts, ) + if shared_experts is None: + pplx_shared_output = None + pplx_output = pplx_outputs + assert isinstance(pplx_output, torch.Tensor) + else: + pplx_shared_output, pplx_output = pplx_outputs + + if shared_output is not None: + assert pplx_shared_output is not None + chunked_shared_output = chunk_by_rank( + shared_output, pgi.rank, + pgi.world_size).to(pplx_shared_output.device) + else: + chunked_shared_output = None + chunked_batch_output = chunk_by_rank( batched_output, pgi.rank, pgi.world_size).to(pplx_output.device) @@ -727,6 +759,15 @@ def _pplx_moe( chunked_batch_output, atol=3e-2, rtol=3e-2) + + if shared_experts is not None: + assert chunked_shared_output is not None + assert pplx_shared_output is not None + torch.testing.assert_close(pplx_shared_output, + chunked_shared_output, + atol=3e-2, + rtol=3e-2) + finally: if use_internode: nvshmem_finalize() @@ 
-788,7 +829,8 @@ def test_pplx_moe_slow( def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool, - make_weights: bool, test_fn: Callable): + use_shared_experts: bool, make_weights: bool, + test_fn: Callable): def format_result(msg, ex=None): if ex is not None: @@ -803,6 +845,14 @@ def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool, else: print(f"PASSED {msg}") + if use_shared_experts: + # Note: this config is only needed for the non-naive shared experts. + new_vllm_config = copy.deepcopy(vllm_config) + new_vllm_config.parallel_config.data_parallel_size = pgi.world_size + new_vllm_config.parallel_config.enable_expert_parallel = True + _set_vllm_config(new_vllm_config, pgi.world_size, pgi.rank, + pgi.local_rank) + current_platform.seed_everything(7) combos = itertools.product(PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES, [False, True], [None, [128, 128]]) @@ -819,9 +869,11 @@ def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool, use_fp8_w8a8 = False quant_dtype = None - test_desc = (f"test_pplx_moe[mnk={mnk}, e={e}, topk={topk}, " - f"dtype={dtype}, per_act_token={per_act_token_quant}, " - f"block_shape={block_shape}") + test_desc = ( + f"test_pplx_moe[mnk={mnk}, e={e}, topk={topk}, " + f"dtype={dtype}, per_act_token={per_act_token_quant}, " + f"block_shape={block_shape}, use_internode={use_internode}, " + f"use_shared_experts={use_shared_experts}") if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None): @@ -852,6 +904,14 @@ def _pplx_test_loop(pgi: ProcessGroupInfo, dp_size: int, use_internode: bool, args["w1_s"] = w1_s args["w2_s"] = w2_s + if use_shared_experts: + args["shared_experts"] = make_shared_experts( + n, + k, + in_dtype=a.dtype, + quant_dtype=quant_dtype, + ) + try: test_fn( pgi=pgi, @@ -891,18 +951,20 @@ def test_pplx_prepare_finalize( current_platform.seed_everything(7) world_size, dp_size = world_dp_size parallel_launch(world_size * dp_size, _pplx_test_loop, dp_size, 
- use_internode, False, _pplx_prepare_finalize) + use_internode, False, False, _pplx_prepare_finalize) @pytest.mark.parametrize("world_dp_size", [[2, 1]]) @pytest.mark.parametrize("use_internode", [False]) +@pytest.mark.parametrize("use_shared_experts", [False, True]) @requires_pplx @multi_gpu_test(num_gpus=2) def test_pplx_moe( world_dp_size: tuple[int, int], use_internode: bool, + use_shared_experts: bool, ): current_platform.seed_everything(7) world_size, dp_size = world_dp_size - parallel_launch(world_size, _pplx_test_loop, dp_size, use_internode, True, - _pplx_moe) + parallel_launch(world_size, _pplx_test_loop, dp_size, use_internode, + use_shared_experts, True, _pplx_moe) diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py index 82960bd57345d..4b58a28eed125 100644 --- a/tests/kernels/moe/utils.py +++ b/tests/kernels/moe/utils.py @@ -8,6 +8,7 @@ import vllm._custom_ops as ops from tests.kernels.quant_utils import per_block_cast_to_int8 from tests.kernels.quantization.nvfp4_utils import (FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX) +from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts) @@ -282,3 +283,151 @@ def per_token_cast_to_fp8( x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) fp8_data = (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn) return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1) + + +# CustomOp? 
+class BaselineMM(torch.nn.Module): + + def __init__( + self, + b: torch.Tensor, + out_dtype: torch.dtype, + ): + super().__init__() + self.b = b.to(dtype=torch.float32) + self.out_dtype = out_dtype + + def forward( + self, + a: torch.Tensor) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + return torch.mm(a.to(dtype=torch.float32), + self.b).to(self.out_dtype), None + + +class TestMLP(torch.nn.Module): + + def __init__( + self, + w1: torch.Tensor, + w2: torch.Tensor, + out_dtype: torch.dtype, + ): + super().__init__() + self.gate_up_proj = BaselineMM(w1, out_dtype) + self.down_proj = BaselineMM(w2, out_dtype) + self.act_fn = SiluAndMul() + + def forward(self, x): + x, _ = self.gate_up_proj(x) + x = self.act_fn(x) + x, _ = self.down_proj(x) + return x + + +def make_naive_shared_experts( + N: int, + K: int, + in_dtype: torch.dtype = torch.bfloat16, +) -> torch.nn.Module: + w1 = torch.randn((K, N * 2), device="cuda", dtype=in_dtype) / 15 + w2 = torch.randn((N, K), device="cuda", dtype=in_dtype) / 15 + return TestMLP(w1, w2, out_dtype=in_dtype) + + +class RealMLP(torch.nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + w1: torch.Tensor, + w2: torch.Tensor, + hidden_act: str = "silu", + quant_config=None, + reduce_results: bool = True, + prefix: str = "", + w1_s: Optional[torch.Tensor] = None, + w2_s: Optional[torch.Tensor] = None, + ) -> None: + from vllm.model_executor.layers.linear import ( + MergedColumnParallelLinear, RowParallelLinear) + + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") + self.gate_up_proj.register_parameter( + "weight", torch.nn.Parameter(w1, requires_grad=False)) + self.gate_up_proj.register_parameter( + "weight_scale", torch.nn.Parameter(w1_s, requires_grad=False)) + self.gate_up_proj.register_parameter( + "input_scale", + None) #torch.nn.Parameter(None, 
requires_grad=False)) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + reduce_results=reduce_results, + prefix=f"{prefix}.down_proj") + self.down_proj.register_parameter( + "weight", torch.nn.Parameter(w2, requires_grad=False)) + self.down_proj.register_parameter( + "weight_scale", torch.nn.Parameter(w2_s, requires_grad=False)) + self.down_proj.register_parameter( + "input_scale", + None) #torch.nn.Parameter(None, requires_grad=False)) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. " + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +def make_shared_experts( + N: int, + K: int, + in_dtype: torch.dtype = torch.bfloat16, + quant_dtype: Union[torch.dtype, str, None] = None, +) -> torch.nn.Module: + from vllm.model_executor.layers.quantization.fp8 import Fp8Config + + (_, w1, w1_s, _), (_, w2, w2_s, _) = make_test_weights( + 1, + N, + K, + in_dtype=in_dtype, + quant_dtype=quant_dtype, + ) + old_dtype = torch.get_default_dtype() + try: + torch.set_default_dtype(in_dtype) + if quant_dtype == torch.float8_e4m3fn: + w1 = w1[0].transpose(0, 1) + w2 = w2[0].transpose(0, 1) + w1_s = w1_s[0].transpose(0, 1) if w1_s is not None else None + w2_s = w2_s[0].transpose(0, 1) if w2_s is not None else None + quant_config = Fp8Config(True) + else: + w1 = w1[0] + w2 = w2[0] + w1_s = None + w2_s = None + quant_config = None + + return RealMLP(K, + N, + w1, + w2, + "silu", + quant_config, + w1_s=w1_s, + w2_s=w2_s) + finally: + torch.set_default_dtype(old_dtype) diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py index 85f87cb21edcd..7c0f30b9aab8c 100644 --- a/vllm/distributed/device_communicators/all2all.py +++ b/vllm/distributed/device_communicators/all2all.py @@ -1,6 +1,6 @@ # 
SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import TYPE_CHECKING, Any +from typing import Any import torch import torch.distributed as dist @@ -13,11 +13,6 @@ from .base_device_communicator import All2AllManagerBase, Cache logger = init_logger(__name__) -if TYPE_CHECKING: - from vllm.model_executor.layers.fused_moe.layer import FusedMoE -else: - FusedMoE = None - class NaiveAll2AllManager(All2AllManagerBase): """ diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py index 9131582eef754..01f59b44a0e69 100644 --- a/vllm/distributed/device_communicators/base_device_communicator.py +++ b/vllm/distributed/device_communicators/base_device_communicator.py @@ -252,7 +252,10 @@ class DeviceCommunicatorBase: moe_modules = [ module for module in model.modules() - if module.__class__.__name__ == "FusedMoE" + # TODO(bnell): Should use isinstance but can't. Maybe search for + # presence of quant_method.init_prepare_finalize? 
+ if (module.__class__.__name__ == "FusedMoE" + or module.__class__.__name__ == "SharedFusedMoE") ] for module in moe_modules: module.quant_method.init_prepare_finalize(module) diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py index 437e569d3130d..2bbe523b4bf98 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional +from typing import Callable, Optional, Union import deep_ep import torch @@ -25,6 +25,8 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): self.num_dispatchers_ = num_dispatchers self.dp_size = dp_size self.rank_expert_offset = rank_expert_offset + self.async_prepare = True + # The dispatch function returns a handle that the combine function # requires. We store the handle here so it is available to the # combine function. 
@@ -56,10 +58,16 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): return None return deep_ep.Buffer.get_combine_config(self.dp_size) - def _do_dispatch(self, tokens: torch.Tensor, - token_scales: Optional[torch.Tensor], - rank_topk_ids: torch.Tensor, - rank_topk_weights: torch.Tensor, num_experts: int): + def _do_dispatch( + self, + tokens: torch.Tensor, + token_scales: Optional[torch.Tensor], + rank_topk_ids: torch.Tensor, + rank_topk_weights: torch.Tensor, + num_experts: int, + a1_scale: Optional[torch.Tensor], + quant_config: FusedMoEQuantConfig, + ) -> Callable: has_scales = token_scales is not None @@ -93,9 +101,36 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): expert_alignment=1, config=self._get_dispatch_config(), previous_event=None, - async_finish=False, + async_finish=self.async_prepare, allocate_on_comm_stream=False) + return lambda: self._receiver( + event, + has_scales, + token_data, + expert_topk_ids, + num_experts, + expert_num_tokens_per_expert_list, + expert_topk_weights, + a1_scale, + quant_config, + ) + + def _receiver( + self, + event: deep_ep.EventOverlap, + has_scales: bool, + token_data: Union[tuple[torch.Tensor, torch.Tensor], torch.Tensor], + expert_topk_ids: Optional[torch.Tensor], + num_experts: int, + expert_num_tokens_per_expert_list: list[int], + expert_topk_weights: Optional[torch.Tensor], + a1_scale: Optional[torch.Tensor], + quant_config: FusedMoEQuantConfig, + ) -> mk.PrepareResultType: + if self.async_prepare: + event.current_stream_wait() + if has_scales: expert_x, expert_x_scale = token_data else: @@ -112,6 +147,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): # DeepEP's topk_ids output refers to the local experts directly. Offset # the topk_ids to move it back to the global experts space so it aligns # with existing vLLM interfaces. 
+ assert expert_topk_ids is not None expert_topk_ids = torch.where( expert_topk_ids == -1, num_experts - 1 if self.rank_expert_offset == 0 else 0, @@ -123,10 +159,28 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): expert_tokens_meta = mk.ExpertTokensMetadata.make_from_list( expert_num_tokens_per_expert_list, device=expert_x.device) + # Dispatch and Quant + # DeepEP kernels only support dispatching block-quantized + # activation scales. + # Dispatch in bfloat16 and quantize afterwards + if not quant_config.is_block_quantized: + # Quantize after dispatch. + expert_x_scale = None + if expert_x.numel() != 0: + expert_x, expert_x_scale = moe_kernel_quantize_input( + expert_x, + a1_scale, + quant_dtype=quant_config.quant_dtype, + per_act_token_quant=False, + block_shape=quant_config.block_shape) + return (expert_x, expert_x_scale, expert_tokens_meta, expert_topk_ids, expert_topk_weights) - def prepare( + def supports_async(self) -> bool: + return True + + def prepare_async( self, a1: torch.Tensor, a1_scale: Optional[torch.Tensor], @@ -137,9 +191,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, - ) -> tuple[torch.Tensor, Optional[torch.Tensor], - Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor], - Optional[torch.Tensor]]: + ) -> Callable: if apply_router_weight_on_input: topk = topk_ids.size(1) @@ -159,37 +211,37 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): ) if a1q_scale is not None and a1q_scale.numel() == 1: a1q_scale = a1q_scale.view(1, 1) - (expert_x, expert_x_scale, expert_tokens_meta, expert_topk_ids, - expert_topk_weights) = self._do_dispatch( - tokens=a1q, - token_scales=a1q_scale, - rank_topk_ids=topk_ids, - rank_topk_weights=topk_weights, - num_experts=num_experts) + a1_post_scale = None else: - # Dispatch and Quant - # DeepEP kernels only support dispatching block-quantized - # 
activation scales. - # Dispatch in bfloat16 - (expert_x, _, expert_tokens_meta, expert_topk_ids, - expert_topk_weights) = self._do_dispatch( - tokens=a1, - token_scales=None, - rank_topk_ids=topk_ids, - rank_topk_weights=topk_weights, - num_experts=num_experts) - # Quantize after dispatch. - expert_x_scale = None - if expert_x.numel() != 0: - expert_x, expert_x_scale = moe_kernel_quantize_input( - expert_x, - a1_scale, - quant_dtype=quant_config.quant_dtype, - per_act_token_quant=False, - block_shape=quant_config.block_shape) + a1q = a1 + a1q_scale = None + a1_post_scale = a1_scale - return (expert_x, expert_x_scale, expert_tokens_meta, expert_topk_ids, - expert_topk_weights) + return self._do_dispatch(tokens=a1q, + token_scales=a1q_scale, + rank_topk_ids=topk_ids, + rank_topk_weights=topk_weights, + num_experts=num_experts, + a1_scale=a1_post_scale, + quant_config=quant_config) + + def prepare( + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, + quant_config: FusedMoEQuantConfig, + ) -> mk.PrepareResultType: + receiver = self.prepare_async(a1, a1_scale, a2_scale, topk_weights, + topk_ids, num_experts, expert_map, + apply_router_weight_on_input, + quant_config) + return receiver() def finalize( self, diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py index 93ac11fb4bfbf..1849e49e0ab51 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Optional, Union +from typing import Callable, Optional, Union import deep_ep import torch @@ 
-75,7 +75,6 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): self, x: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], a1_scale: Optional[torch.Tensor], - a2_scale: Optional[torch.Tensor], a1_dtype: torch.dtype, quant_dtype: Union[torch.dtype, str, None], per_act_token_quant: bool, @@ -110,7 +109,10 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): return x, x_scales - def prepare( + def supports_async(self) -> bool: + return True + + def prepare_async( self, a1: torch.Tensor, a1_scale: Optional[torch.Tensor], @@ -121,9 +123,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, - ) -> tuple[torch.Tensor, Optional[torch.Tensor], - Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor], - Optional[torch.Tensor]]: + ) -> mk.ReceiverType: hidden_size = a1.size(1) assert hidden_size in self.SUPPORTED_HIDDEN_SIZES, \ @@ -155,16 +155,48 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): num_experts, use_fp8=self.use_fp8_dispatch, async_finish=False, - return_recv_hook=False) + return_recv_hook=True) + + return lambda: self._receiver(hook, expert_x, expert_num_tokens, + a1_scale, a1.dtype, quant_config) + + def _receiver( + self, + hook: Callable, + expert_x: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], + expert_num_tokens: torch.Tensor, + a1_scale, + a1_dtype, + quant_config: FusedMoEQuantConfig, + ) -> mk.PrepareResultType: + hook() expert_x, expert_x_scale = self._do_quant( - expert_x, a1_scale, a2_scale, a1.dtype, quant_config.quant_dtype, + expert_x, a1_scale, a1_dtype, quant_config.quant_dtype, quant_config.per_act_token_quant, quant_config.block_shape) expert_tokens_meta = mk.ExpertTokensMetadata( expert_num_tokens=expert_num_tokens, expert_num_tokens_cpu=None) - return (expert_x, expert_x_scale, expert_tokens_meta, None, None) + return expert_x, expert_x_scale, 
expert_tokens_meta, None, None + + def prepare( + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, + quant_config: FusedMoEQuantConfig, + ) -> mk.PrepareResultType: + receiver = self.prepare_async(a1, a1_scale, a2_scale, topk_weights, + topk_ids, num_experts, expert_map, + apply_router_weight_on_input, + quant_config) + return receiver() def finalize( self, diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py index 061b02172c446..157cb36d4ffd3 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py @@ -56,9 +56,7 @@ class FlashInferCutlassMoEPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): apply_router_weight_on_input: bool, # TODO(bnell): use quant_config + scales instead of ctor args quant_config: FusedMoEQuantConfig, - ) -> tuple[torch.Tensor, Optional[torch.Tensor], - Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor], - Optional[torch.Tensor]]: + ) -> mk.PrepareResultType: if apply_router_weight_on_input: topk = topk_ids.size(1) diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py index b46f4be4b912e..88063668e9188 100644 --- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py @@ -506,9 +506,7 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, - ) -> tuple[torch.Tensor, Optional[torch.Tensor], - Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor], - 
Optional[torch.Tensor]]: + ) -> mk.PrepareResultType: assert a1.dim() == 2 assert topk_ids.dim() == 2 assert topk_ids.size(0) == a1.size(0) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 3a2c9cbaf459e..b1a61ade53649 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -4,7 +4,7 @@ from abc import abstractmethod from collections.abc import Iterable from enum import Enum -from typing import Callable, Literal, Optional, overload +from typing import Callable, Literal, Optional, Union, overload import torch import torch.nn.functional as F @@ -215,6 +215,7 @@ class FusedMoEMethodBase(QuantizeMethodBase): self.fused_experts = FusedMoEModularKernel( prepare_finalize, experts, + layer.shared_experts, ) def select_gemm_impl( @@ -252,7 +253,7 @@ class FusedMoEMethodBase(QuantizeMethodBase): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: raise NotImplementedError @@ -409,7 +410,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: if enable_eplb: assert expert_load_view is not None assert logical_to_physical_map is not None @@ -461,7 +462,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: topk_weights, topk_ids = FusedMoE.select_experts( 
hidden_states=x, @@ -547,7 +548,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, - ): + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: if enable_eplb is not False or expert_load_view is not None or \ logical_to_physical_map is not None or \ logical_replica_count is not None: @@ -594,7 +595,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, - ): + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: if enable_eplb is not False or expert_load_view is not None or \ logical_to_physical_map is not None or \ logical_replica_count is not None: @@ -633,7 +634,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: assert not use_grouped_topk assert num_expert_group is None assert topk_group is None @@ -948,6 +949,10 @@ class FusedMoE(CustomOp): dtype=moe.in_dtype, device=torch.cuda.current_device()) + @property + def shared_experts(self) -> Optional[torch.nn.Module]: + return None + @property def tp_size(self): return self.moe_parallel_config.tp_size @@ -1400,6 +1405,7 @@ class FusedMoE(CustomOp): return [ weight.view(self.local_num_experts, -1) for name, weight in weights if name not in NON_EXPERT_WEIGHTS + and not name.startswith("_shared_experts.") ] def set_eplb_state( @@ -1582,25 +1588,45 @@ class FusedMoE(CustomOp): else: return tensor_model_parallel_all_reduce(final_hidden_states) - def forward(self, hidden_states: torch.Tensor, - router_logits: 
torch.Tensor): + def forward( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: og_hidden_states = hidden_states.shape[-1] if self.hidden_size != og_hidden_states: hidden_states = F.pad(hidden_states, (0, self.hidden_size - og_hidden_states), mode='constant', value=0.0) - # TODO: Once the OOM issue for the TPU backend is resolved, we will - # switch to using the moe_forward custom op. - if current_platform.is_tpu(): - return self.forward_impl(hidden_states, router_logits) - else: - return torch.ops.vllm.moe_forward( - hidden_states, router_logits, - self.layer_name)[..., :og_hidden_states] - def forward_impl_chunked(self, full_hidden_states: torch.Tensor, - full_router_logits: torch.Tensor): + if self.shared_experts is None: + if current_platform.is_tpu(): + # TODO: Once the OOM issue for the TPU backend is resolved, we + # will switch to using the moe_forward custom op. + fused_output = self.forward_impl(hidden_states, router_logits) + assert not isinstance(fused_output, tuple) + else: + fused_output = torch.ops.vllm.moe_forward( + hidden_states, router_logits, self.layer_name) + return fused_output[..., :og_hidden_states] + else: + if current_platform.is_tpu(): + # TODO: Once the OOM issue for the TPU backend is resolved, we + # will switch to using the moe_forward custom op. 
+ shared_output, fused_output = self.forward_impl( + hidden_states, router_logits) + else: + shared_output, fused_output = torch.ops.vllm.moe_forward_shared( + hidden_states, router_logits, self.layer_name) + return (shared_output[..., :og_hidden_states], + fused_output[..., :og_hidden_states]) + + def forward_impl_chunked( + self, + full_hidden_states: torch.Tensor, + full_router_logits: torch.Tensor, + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: assert self.batched_hidden_states is not None assert self.batched_router_logits is not None assert self.batched_hidden_states.dtype == full_hidden_states.dtype @@ -1611,7 +1637,10 @@ class FusedMoE(CustomOp): assert ( self.batched_router_logits.size(-1) == full_router_logits.size(-1)) - full_final_hidden_states = torch.empty_like(full_hidden_states) + full_fused_final_hidden_states = torch.empty_like(full_hidden_states) + if self.shared_experts is not None: + full_shared_final_hidden_states = torch.empty_like( + full_hidden_states) def process_chunk(chunk_start, chunk_end, skip_result_store=False): chunk_size = chunk_end - chunk_start @@ -1652,9 +1681,21 @@ class FusedMoE(CustomOp): logical_replica_count=self.logical_replica_count, ) + assert self.shared_experts is None or isinstance( + final_hidden_states, tuple) + if not skip_result_store: - full_final_hidden_states[chunk_start:chunk_end, :].copy_( - final_hidden_states, non_blocking=True) + if self.shared_experts is None: + full_fused_final_hidden_states[ + chunk_start:chunk_end, :].copy_(final_hidden_states, + non_blocking=True) + else: + full_shared_final_hidden_states[ + chunk_start:chunk_end, :].copy_(final_hidden_states[0], + non_blocking=True) + full_fused_final_hidden_states[ + chunk_start:chunk_end, :].copy_(final_hidden_states[1], + non_blocking=True) ctx = get_forward_context() # flashinfer_cutlass_kernels can handle: optional DP + TP/EP @@ -1675,10 +1716,17 @@ class FusedMoE(CustomOp): chunk_end, skip_result_store=chunk_start_ >= num_tokens) 
- return full_final_hidden_states + if self.shared_experts is None: + return full_fused_final_hidden_states + else: + return (full_shared_final_hidden_states, + full_fused_final_hidden_states) - def forward_impl(self, hidden_states: torch.Tensor, - router_logits: torch.Tensor): + def forward_impl( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: assert self.quant_method is not None # Route to the chunked forward path using the FlashInfer Cutlass kernel # only when data parallelism (DP) is enabled. @@ -1698,6 +1746,15 @@ class FusedMoE(CustomOp): hidden_states, router_logits = get_ep_group().dispatch( hidden_states, router_logits) + # If there are shared experts but we are not using a modular kernel, the + # shared experts must be called here + if (not isinstance(self.quant_method.fused_experts, + FusedMoEModularKernel) + and self.shared_experts is not None): + shared_output = self.shared_experts(hidden_states) + else: + shared_output = None + # Matrix multiply. final_hidden_states = self.quant_method.apply( layer=self, @@ -1722,14 +1779,30 @@ class FusedMoE(CustomOp): logical_replica_count=self.logical_replica_count, ) - if do_naive_dispatch_combine: - final_hidden_states = get_ep_group().combine(final_hidden_states) - if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1): - # Default set to False. (May have to add shared expert outputs. 
- final_hidden_states = self.maybe_all_reduce_tensor_model_parallel( - final_hidden_states) + if shared_output is not None: + assert not isinstance(final_hidden_states, tuple) + assert self.shared_experts is not None + final_hidden_states = ( + shared_output, + final_hidden_states, + ) - return final_hidden_states + def reduce_output(states: torch.Tensor) -> torch.Tensor: + if do_naive_dispatch_combine: + states = get_ep_group().combine(states) + + if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1): + states = self.maybe_all_reduce_tensor_model_parallel(states) + + return states + + if self.shared_experts is None: + return reduce_output(final_hidden_states) + else: + return ( + reduce_output(final_hidden_states[0]), + reduce_output(final_hidden_states[1]), + ) @classmethod def make_expert_params_mapping( @@ -1784,17 +1857,22 @@ class FusedMoE(CustomOp): return s -def moe_forward(hidden_states: torch.Tensor, router_logits: torch.Tensor, - layer_name: str) -> torch.Tensor: +def moe_forward( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + layer_name: str, +) -> torch.Tensor: forward_context: ForwardContext = get_forward_context() self = forward_context.no_compile_layers[layer_name] - assert self.quant_method is not None - + assert self.shared_experts is None return self.forward_impl(hidden_states, router_logits) -def moe_forward_fake(hidden_states: torch.Tensor, router_logits: torch.Tensor, - layer_name: str) -> torch.Tensor: +def moe_forward_fake( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + layer_name: str, +) -> torch.Tensor: return torch.empty_like(hidden_states) @@ -1807,6 +1885,37 @@ direct_register_custom_op( tags=(torch.Tag.needs_fixed_stride_order, ), ) + +def moe_forward_shared( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + layer_name: str, +) -> tuple[torch.Tensor, torch.Tensor]: + forward_context: ForwardContext = get_forward_context() + self = forward_context.no_compile_layers[layer_name] 
+ assert self.shared_experts is not None + return self.forward_impl(hidden_states, router_logits) + + +def moe_forward_shared_fake( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + layer_name: str, +) -> tuple[torch.Tensor, torch.Tensor]: + shared_out = torch.empty_like(hidden_states) + fused_out = torch.empty_like(hidden_states) + return shared_out, fused_out + + +direct_register_custom_op( + op_name="moe_forward_shared", + op_func=moe_forward_shared, + mutates_args=["hidden_states"], + fake_impl=moe_forward_shared_fake, + dispatch_key=current_platform.dispatch_key, + tags=(torch.Tag.needs_fixed_stride_order, ), +) + # Mark the FusedMoE weight_loader as supporting MoE-specific parameters # to avoid expensive runtime reflection in model loading code FusedMoE.weight_loader.supports_moe_loading = True # type: ignore[attr-defined] diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 2ea6383d5ae90..7a8c6f8571deb 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -4,7 +4,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass from enum import Enum from math import prod -from typing import Optional, final +from typing import Callable, Optional, Union, final import torch @@ -141,6 +141,29 @@ class TopKWeightAndReduce(ABC): raise NotImplementedError +# +# PrepareResultType is a tuple of: +# - quantized + dispatched a. +# - quantized + dispatched a1_scales. +# - Optional ExpertTokensMetadata containing gpu/cpu tensors +# as big as the number of local experts with the information about the +# number of tokens assigned to each local expert. +# - Optional dispatched expert topk IDs +# - Optional dispatched expert topk weight +# +# See `prepare` method below. 
+# +PrepareResultType = tuple[ + torch.Tensor, + Optional[torch.Tensor], + Optional[ExpertTokensMetadata], + Optional[torch.Tensor], + Optional[torch.Tensor], +] + +ReceiverType = Callable[[], PrepareResultType] + + # TODO: pass FusedMoEParallelConfig in as ctor parameter? class FusedMoEPrepareAndFinalize(ABC): """ @@ -160,16 +183,9 @@ class FusedMoEPrepareAndFinalize(ABC): expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, - ) -> tuple[ - torch.Tensor, - Optional[torch.Tensor], - Optional[ExpertTokensMetadata], - Optional[torch.Tensor], - Optional[torch.Tensor], - ]: + ) -> PrepareResultType: """ - Perform any quantization (and/or) dispatching needed - for this kernel. + Perform any quantization (and/or) dispatching needed for this kernel. - a1: The (unquantized) input to the MoE layer. - a1_scale: Optional scales for a1 - a2_scale: Optional scales for the second MoE gemm. Required to make @@ -193,6 +209,51 @@ class FusedMoEPrepareAndFinalize(ABC): """ raise NotImplementedError + def supports_async(self) -> bool: + """ + Indicates whether or not this class implements prepare_async. + """ + return False + + def prepare_async( + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, + quant_config: FusedMoEQuantConfig, + ) -> ReceiverType: + """ + Perform any quantization (and/or) dispatching needed for this kernel + but do not wait for results from other workers. + - a1: The (unquantized) input to the MoE layer. + - a1_scale: Optional scales for a1 + - a2_scale: Optional scales for the second MoE gemm. Required to make + sure the quantization is consistent for both gemms. + - topk_ids: The topk ids. + - topk_weights: The topk weights. + - num_experts: The total number of experts in the global expert space. 
+ - expert_map: A tensor mapping expert indices from the global expert + space to the local expert space of the expert parallel shard. + - apply_router_weight_on_input: When True, apply the weights to the + activations, before quantization + dispatching. + + Returns a callback that when invoked waits for results from other + workers and has the same return signature as `prepare`, e.g. + + receiver = obj.prepare_async(...) + a, a_scales, expert_meta, topk_ids, topk_weights = receiver() + + is equivalent to: + + a, a_scales, expert_meta, topk_ids, topk_weights = obj.prepare(...) + """ + raise NotImplementedError + @abstractmethod def finalize( self, @@ -453,10 +514,12 @@ class FusedMoEModularKernel(torch.nn.Module): self, prepare_finalize: FusedMoEPrepareAndFinalize, fused_experts: FusedMoEPermuteExpertsUnpermute, + shared_experts: Optional[torch.nn.Module] = None, ): super().__init__() self.prepare_finalize = prepare_finalize self.fused_experts = fused_experts + self.shared_experts = shared_experts assert prepare_finalize.activation_format == \ fused_experts.activation_formats[0], ( f"{prepare_finalize.__class__.__name__}." @@ -692,7 +755,7 @@ class FusedMoEModularKernel(torch.nn.Module): a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, apply_router_weight_on_input: bool = False, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: """ This function computes a Mixture of Experts (MoE) layer using two sets of weights, w1 and w2, and top-k gating mechanism. 
@@ -736,18 +799,46 @@ class FusedMoEModularKernel(torch.nn.Module): if global_num_experts == -1: global_num_experts = local_num_experts - (a1q, a1q_scale, expert_tokens_meta, _expert_topk_ids, - _expert_topk_weights) = self.prepare_finalize.prepare( - a1, - a1_scale, - a2_scale, - topk_weights, - topk_ids, - global_num_experts, - expert_map, - apply_router_weight_on_input, - self.fused_experts.quant_config, - ) + shared_output: torch.Tensor + + if (not self.prepare_finalize.supports_async() + or self.shared_experts is None): + + # Run shared experts serially with dispatch. + if self.shared_experts is not None: + shared_output = self.shared_experts(a1) + + (a1q, a1q_scale, expert_tokens_meta, _expert_topk_ids, + _expert_topk_weights) = self.prepare_finalize.prepare( + a1, + a1_scale, + a2_scale, + topk_weights, + topk_ids, + global_num_experts, + expert_map, + apply_router_weight_on_input, + self.fused_experts.quant_config, + ) + else: + # Overlap shared expert compute with all2all dispatch. + receiver = self.prepare_finalize.prepare_async( + a1, + a1_scale, + a2_scale, + topk_weights, + topk_ids, + global_num_experts, + expert_map, + apply_router_weight_on_input, + self.fused_experts.quant_config, + ) + + assert self.shared_experts is not None + shared_output = self.shared_experts(a1) + + (a1q, a1q_scale, expert_tokens_meta, _expert_topk_ids, + _expert_topk_weights) = receiver() # Maybe prepare gathered topk_ids and topk_weights from other EP ranks. 
topk_ids = topk_ids if _expert_topk_ids is None else _expert_topk_ids @@ -795,4 +886,7 @@ class FusedMoEModularKernel(torch.nn.Module): self.fused_experts.finalize_weight_and_reduce_impl(), ) - return output + if self.shared_experts is None: + return output + else: + return shared_output, output diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py index 401f37922b7bb..2ae79e69f5554 100644 --- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -84,12 +84,15 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): return self.max_num_tokens def topk_indices_dtype(self) -> Optional[torch.dtype]: - return torch.int32 + return torch.uint32 def num_dispatchers(self) -> int: return self.num_dispatchers_ - def prepare( + def supports_async(self) -> bool: + return True + + def prepare_async( self, a1: torch.Tensor, a1_scale: Optional[torch.Tensor], @@ -100,9 +103,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, - ) -> tuple[torch.Tensor, Optional[torch.Tensor], - Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor], - Optional[torch.Tensor]]: + ) -> mk.ReceiverType: num_tokens = a1.size(0) # M hidden_dim = a1.size(-1) # K @@ -138,6 +139,8 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): _validate_scale_shape(a1q, a1q_scale, quant_config.per_act_token_quant, quant_config.block_shape) + orig_a_scale_block_shape: Optional[int] = None + if a1q_scale is not None: scalar_scales = a1q_scale.numel() == 1 @@ -205,8 +208,45 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): out_expert_x_scale=expert_x_scale, dp_x=a1q, dp_x_scale=a1q_scale, - indices=topk_ids.view(dtype=torch.uint32), + indices=topk_ids, bound_m=bound_m, + do_send=True, + do_recv=False, + ) 
+ + return lambda: self._receiver( + expert_num_tokens, + expert_x, + expert_x_scale, + a1q, + a1q_scale, + topk_ids, + bound_m, + orig_a_scale_block_shape, + ) + + def _receiver( + self, + expert_num_tokens: torch.Tensor, + expert_x: torch.Tensor, + expert_x_scale: Optional[torch.Tensor], + a1q: torch.Tensor, + a1q_scale: Optional[torch.Tensor], + topk_ids: torch.Tensor, + bound_m: Optional[torch.Tensor], + orig_a_scale_block_shape: Optional[int], + ) -> mk.PrepareResultType: + + self.a2a.dispatch( + out_expert_num_tokens=expert_num_tokens, + out_expert_x=expert_x, + out_expert_x_scale=expert_x_scale, + dp_x=a1q, + dp_x_scale=a1q_scale, + indices=topk_ids, + bound_m=bound_m, + do_send=False, + do_recv=True, ) if expert_x_scale is not None: @@ -218,6 +258,31 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): return expert_x, expert_x_scale, expert_tokens_meta, None, None + def prepare( + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, + quant_config: FusedMoEQuantConfig, + ) -> mk.PrepareResultType: + receiver = self.prepare_async( + a1, + a1_scale, + a2_scale, + topk_weights, + topk_ids, + num_experts, + expert_map, + apply_router_weight_on_input, + quant_config, + ) + return receiver() + def finalize( self, output: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py index 567a0a88fec0a..bd9f7d4a06b17 100644 --- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py @@ -38,9 +38,7 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize): expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, quant_config: FusedMoEQuantConfig, - ) -> tuple[torch.Tensor, 
Optional[torch.Tensor], - Optional[mk.ExpertTokensMetadata], Optional[torch.Tensor], - Optional[torch.Tensor]]: + ) -> mk.PrepareResultType: if apply_router_weight_on_input: topk = topk_ids.size(1) diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 8293d42ef4556..bf99f0823b745 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Callable, Optional +from typing import Any, Callable, Optional, Union import torch from torch.nn import Parameter @@ -505,7 +505,7 @@ class AWQMoEMethod(FusedMoEMethodBase): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: assert self.fused_experts is None if enable_eplb: diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 9713757df9b07..2245c59af6fea 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -474,7 +474,7 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: from vllm.model_executor.layers.fused_moe import fused_experts assert self.fused_experts is None diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 
e4585419226cd..c2b884c058d3a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -3,7 +3,7 @@ import enum from enum import Enum -from typing import Callable, Optional +from typing import Callable, Optional, Union import torch from compressed_tensors import CompressionFormat @@ -358,7 +358,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: assert self.fused_experts is None if enable_eplb: @@ -819,7 +819,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: if enable_eplb: raise NotImplementedError( "EPLB not supported for " @@ -1069,7 +1069,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: assert self.fused_experts is None if enable_eplb: @@ -1375,7 +1375,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: assert self.fused_experts is None if enable_eplb: @@ -1608,7 +1608,7 @@ class 
CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: assert self.fused_experts is None if enable_eplb: diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 2d8a684bc7d90..b361fe9bea088 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Callable, Optional +from typing import Any, Callable, Optional, Union import torch @@ -128,7 +128,7 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: assert self.fused_experts is None if enable_eplb: diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index d9e01dcf40d5a..de22cceb45d1e 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import TYPE_CHECKING, Any, Callable, Optional +from typing import TYPE_CHECKING, Any, Callable, Optional, Union import torch import torch.nn.functional as F @@ -988,7 +988,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] 
= None, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: if enable_eplb: assert expert_load_view is not None assert logical_to_physical_map is not None diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index ad648df238194..01af1ccd9ae06 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Callable, Optional +from typing import Any, Callable, Optional, Union import gguf import torch @@ -540,7 +540,7 @@ class GGUFMoEMethod(FusedMoEMethodBase): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, - ): + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: assert self.fused_experts is None if enable_eplb: diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 3644d91f64e3c..cf959e13bc45c 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -654,7 +654,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: assert self.fused_experts is None if enable_eplb: diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 4bb8438d90844..e140807879177 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -491,7 +491,7 @@ class 
ModelOptFp8MoEMethod(FusedMoEMethodBase): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: if enable_eplb: raise NotImplementedError( "EPLB not supported for `ModelOptFp8MoEMethod` yet.") @@ -1366,7 +1366,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, - ): + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: if enable_eplb: raise NotImplementedError( "EPLB not supported for `ModelOptNvFp4FusedMoE` yet.") diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index fb3e4b518bf6c..d6d7ec9b15805 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Callable, Optional +from typing import Any, Callable, Optional, Union import torch @@ -305,7 +305,7 @@ class MoeWNA16Method(FusedMoEMethodBase): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: assert self.fused_experts is None if enable_eplb: raise NotImplementedError( diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 85d05ff51daa1..889c15df3c878 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Callable, Optional +from typing import Callable, Optional, Union import torch from torch.nn.parameter import Parameter @@ -554,7 +554,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: if enable_eplb: raise NotImplementedError("EPLB is not supported for mxfp4") diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index fdf03ded04480..6cff9f3019d34 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Callable, Optional +from typing import Any, Callable, Optional, Union import torch @@ -226,7 +226,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: assert self.fused_experts is None if enable_eplb: @@ -390,7 +390,7 @@ class QuarkW4A4MXFp4MoEMethod(QuarkMoEMethod): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: assert self.fused_experts is None if enable_eplb: diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py index 8f72b8cbea7a7..0d5fa05652b80 100644 --- 
a/vllm/model_executor/layers/quantization/rtn.py +++ b/vllm/model_executor/layers/quantization/rtn.py @@ -3,7 +3,7 @@ # Copyright © 2025, Oracle and/or its affiliates. import os -from typing import Any, Callable, Optional +from typing import Any, Callable, Optional, Union import torch import torch.nn.functional as F @@ -291,7 +291,7 @@ class RTNMoEMethod(FusedMoEMethodBase): expert_load_view: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: assert self.fused_experts is None if enable_eplb: diff --git a/vllm/model_executor/layers/shared_fused_moe/__init__.py b/vllm/model_executor/layers/shared_fused_moe/__init__.py new file mode 100644 index 0000000000000..b87c69d3edd04 --- /dev/null +++ b/vllm/model_executor/layers/shared_fused_moe/__init__.py @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from vllm.model_executor.layers.shared_fused_moe.shared_fused_moe import ( + SharedFusedMoE) + +__all__ = ["SharedFusedMoE"] diff --git a/vllm/model_executor/layers/shared_fused_moe/shared_fused_moe.py b/vllm/model_executor/layers/shared_fused_moe/shared_fused_moe.py new file mode 100644 index 0000000000000..e1e3d188d9852 --- /dev/null +++ b/vllm/model_executor/layers/shared_fused_moe/shared_fused_moe.py @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional + +import torch + +from vllm.distributed import tensor_model_parallel_all_reduce +from vllm.model_executor.layers.fused_moe.layer import FusedMoE + + +# TODO(bnell): Add shared + fused combo function? e.g. + +class SharedFusedMoE(FusedMoE): + """ + A FusedMoE operation that also computes the results of shared experts. 
+ If an all2all communicator is being used the shared expert computation + can be interleaved with the fused all2all dispatch communication step. + """ + + def __init__( + self, + shared_experts: torch.nn.Module, + use_overlapped: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self._shared_experts = shared_experts + self.use_overlapped = use_overlapped + + @property + def shared_experts(self) -> Optional[torch.nn.Module]: + return self._shared_experts if self.use_overlapped else None + + def forward( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + if not self.use_overlapped: + shared_out = self._shared_experts(hidden_states) + + # Reduce outputs if necessary, since the MLP should + # have been created with reduce_results=False. + if (self.reduce_results and self.tp_size > 1 + and self.must_reduce_shared_expert_outputs()): + shared_out = tensor_model_parallel_all_reduce(shared_out) + + fused_out = super().forward( + hidden_states=hidden_states, + router_logits=router_logits, + ) + else: + shared_out, fused_out = super().forward( + hidden_states=hidden_states, + router_logits=router_logits, + ) + return shared_out, fused_out diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 3a8eaf681733d..7db6fc5d8ad89 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -49,6 +49,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ 
-147,63 +148,85 @@ class DeepseekV2MoE(nn.Module): self.physical_expert_end = (self.physical_expert_start + self.n_local_physical_experts) - self.experts = FusedMoE( - num_experts=config.n_routed_experts, - top_k=config.num_experts_per_tok, - hidden_size=config.hidden_size, - intermediate_size=config.moe_intermediate_size, - reduce_results=False, - renormalize=config.norm_topk_prob, - quant_config=quant_config, - use_grouped_topk=True, - num_expert_group=config.n_group, - topk_group=config.topk_group, - prefix=f"{prefix}.experts", - scoring_func=config.scoring_func, - # we do scaling outside, set factor to 1.0 to avoid double mul - routed_scaling_factor=1.0, - e_score_correction_bias=self.gate.e_score_correction_bias, - enable_eplb=self.enable_eplb, - num_redundant_experts=self.n_redundant_experts) - - if config.n_shared_experts is not None: + if config.n_shared_experts is None: + self.experts = FusedMoE( + num_experts=config.n_routed_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=config.norm_topk_prob, + quant_config=quant_config, + use_grouped_topk=True, + num_expert_group=config.n_group, + topk_group=config.topk_group, + prefix=f"{prefix}.experts", + scoring_func=config.scoring_func, + # we do scaling outside, set factor to 1.0 to avoid double mul + routed_scaling_factor=1.0, + e_score_correction_bias=self.gate.e_score_correction_bias, + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts) + self.shared_experts = None + else: intermediate_size = (config.moe_intermediate_size * config.n_shared_experts) + self.shared_experts = DeepseekV2MLP( hidden_size=config.hidden_size, intermediate_size=intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, - reduce_results=self.experts.must_reduce_shared_expert_outputs( - ), + reduce_results=False, prefix=f"{prefix}.shared_experts", ) + self.experts = 
SharedFusedMoE( + shared_experts=self.shared_experts, + num_experts=config.n_routed_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=config.norm_topk_prob, + quant_config=quant_config, + use_grouped_topk=True, + num_expert_group=config.n_group, + topk_group=config.topk_group, + prefix=f"{prefix}.experts", + scoring_func=config.scoring_func, + # we do scaling outside, set factor to 1.0 to avoid double mul + routed_scaling_factor=1.0, + e_score_correction_bias=self.gate.e_score_correction_bias, + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts) + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: num_tokens, hidden_dim = hidden_states.shape hidden_states = hidden_states.view(-1, hidden_dim) - if self.n_shared_experts is not None: - shared_output = self.shared_experts(hidden_states) # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) - if hidden_states.dtype != torch.float16: - final_hidden_states = self.experts( - hidden_states=hidden_states, - router_logits=router_logits) * self.routed_scaling_factor + fused_moe_out = self.experts(hidden_states=hidden_states, + router_logits=router_logits) + + if self.shared_experts is not None: + shared_output, final_hidden_states = fused_moe_out else: - # Fix FP16 overflow - # See DeepseekV2DecoderLayer for more details. - final_hidden_states = self.experts(hidden_states=hidden_states, - router_logits=router_logits) - if shared_output is not None: - if hidden_states.dtype != torch.float16: - final_hidden_states = final_hidden_states + shared_output - else: - # Fix FP16 overflow - # See DeepseekV2DecoderLayer for more details. - final_hidden_states = final_hidden_states + shared_output \ - * (1. 
/ self.routed_scaling_factor) + shared_output = None + final_hidden_states = fused_moe_out + + # Fix FP16 overflow + # See DeepseekV2DecoderLayer for more details. + if hidden_states.dtype != torch.float16: + final_hidden_states *= self.routed_scaling_factor + elif self.shared_experts is not None: + assert shared_output is not None + shared_output *= (1. / self.routed_scaling_factor) + + if self.shared_experts is not None: + assert shared_output is not None + final_hidden_states += shared_output if self.tp_size > 1: final_hidden_states = ( diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 284506b642d66..1fb4576092892 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -184,6 +184,8 @@ class Glm4MoE(nn.Module): if self.n_shared_experts is not None: shared_output = self.shared_experts(hidden_states) + else: + shared_output = None router_logits = self.gate(hidden_states.to(dtype=torch.float32)) final_hidden_states = self.experts( hidden_states=hidden_states, diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index ba08e6f81f7fe..ddd7e6a5936e3 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -36,6 +36,7 @@ from vllm.model_executor.layers.linear import (QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) @@ -73,7 +74,18 @@ class Llama4MoE(nn.Module): quant_config=None, prefix=f"{prefix}.router") - self.experts = FusedMoE( + self.shared_expert = LlamaMLP( + hidden_size=config.hidden_size, + intermediate_size=intermediate_size_moe, + hidden_act="silu", + quant_config=quant_config, + bias=False, 
+ prefix=f"{prefix}.shared_expert", + reduce_results=False, + ) + + self.experts = SharedFusedMoE( + shared_experts=self.shared_expert, num_experts=config.num_local_experts, top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, @@ -83,22 +95,13 @@ class Llama4MoE(nn.Module): reduce_results=False, renormalize=False, quant_config=quant_config, - prefix=f"{prefix}.experts") - - self.shared_expert = LlamaMLP( - hidden_size=config.hidden_size, - intermediate_size=intermediate_size_moe, - hidden_act="silu", - quant_config=quant_config, - bias=False, - prefix=f"{prefix}.shared_expert", - reduce_results=self.experts.must_reduce_shared_expert_outputs(), + prefix=f"{prefix}.experts", ) def forward(self, hidden_states): router_logits, _ = self.router(hidden_states) - shared_out = self.shared_expert(hidden_states) - routed_out = self.experts( + + shared_out, routed_out = self.experts( hidden_states=hidden_states, router_logits=router_logits, ) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index cb000d53a923d..affba877ecf92 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -500,7 +500,8 @@ class Worker(WorkerBase): parallel_config = self.vllm_config.parallel_config moe_modules = [ module for module in self.model_runner.model.modules() - if module.__class__.__name__ == "FusedMoE" + if (module.__class__.__name__ == "FusedMoE" + or module.__class__.__name__ == "SharedFusedMoE") ] num_local_experts = moe_modules[0].moe_config.num_local_experts assert all(module.moe_config.num_local_experts == num_local_experts From 731a6940e39e84619bbc8db8a794563bb8cc61a5 Mon Sep 17 00:00:00 2001 From: Benji Beck Date: Wed, 3 Sep 2025 11:04:00 -0700 Subject: [PATCH 61/95] Migrate whisper inputs to TensorSchema (#23505) Signed-off-by: Benji Beck --- vllm/model_executor/models/whisper.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/whisper.py 
b/vllm/model_executor/models/whisper.py index 848b6e0f8093a..97e8cd6e76957 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -4,7 +4,7 @@ import math from collections.abc import Iterable, Mapping, Sequence from contextlib import nullcontext -from typing import Literal, Optional, TypedDict, Union, cast +from typing import Annotated, Literal, Optional, Union, cast import numpy as np import torch @@ -40,6 +40,7 @@ from vllm.multimodal.processing import (BaseProcessingInfo, PromptReplacement, PromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.transformers_utils.processor import cached_get_processor +from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, SupportsTranscription, SupportsV0Only) @@ -111,9 +112,16 @@ ISO639_1_SUPPORTED_LANGS = { } -class WhisperAudioInputs(TypedDict): - input_features: NestedTensors - """Shape: `(batch_size, 128, M)`""" +class WhisperAudioInputs(TensorSchema): + """ + Dimensions: + - b: Batch size + - nmb: Number of mel bins + - t: Time frames (M) + """ + + input_features: Annotated[Optional[NestedTensors], + TensorShape("b", "nmb", "t")] class WhisperPositionalEmbedding(nn.Embedding): From a742322092379a25d83300a4d44abc7cf225a700 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 3 Sep 2025 14:05:24 -0400 Subject: [PATCH 62/95] [Attention] Blackwell FP8 MLA support with CUTLASS_MLA backend (#23289) Signed-off-by: Matthew Bonanni --- .../attention/mla/sm100_cutlass_mla_kernel.cu | 16 +- tests/kernels/test_cutlass_mla_decode.py | 246 ++++++++++++------ vllm/platforms/cuda.py | 4 +- vllm/v1/attention/backends/mla/cutlass_mla.py | 23 +- 4 files changed, 184 insertions(+), 105 deletions(-) diff --git a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu index 6dd6f269f3dc9..820bf81dd1a02 100644 --- 
a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu +++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu @@ -64,11 +64,11 @@ struct IsPersistent { static const bool value = v; }; -template > +template > struct MlaSm100 { using Element = T; using ElementAcc = float; - using ElementOut = T; + using ElementOut = TOut; using TileShape = Shape<_128, _128, Shape<_512, _64>>; using TileShapeH = cute::tuple_element_t<0, TileShape>; @@ -178,7 +178,7 @@ typename T::Fmha::Arguments args_from_options( return arguments; } -template +template void runMla( at::Tensor const& out, at::Tensor const& q_nope, @@ -190,7 +190,7 @@ void runMla( double sm_scale, int64_t num_kv_splits, cudaStream_t stream) { - using MlaSm100Type = MlaSm100; + using MlaSm100Type = MlaSm100; typename MlaSm100Type::Fmha fmha; auto arguments = args_from_options(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits); @@ -233,13 +233,13 @@ void sm100_cutlass_mla_decode( DISPATCH_BOOL(page_size == 128, IsPaged128, [&] { DISPATCH_BOOL(num_kv_splits <= 1, NotManualSplitKV, [&] { if (in_dtype == at::ScalarType::Half) { - runMla>( + runMla>( out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream); } else if (in_dtype == at::ScalarType::BFloat16) { - runMla>( + runMla>( out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream); } else if (in_dtype == at::ScalarType::Float8_e4m3fn) { - runMla>( + runMla>( out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream); } else { TORCH_CHECK(false, "Unsupported input data type of MLA"); @@ -253,7 +253,7 @@ void sm100_cutlass_mla_decode( int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_batches, int64_t sm_count, int64_t num_kv_splits) { // Workspace size depends on ElementAcc and ElementLSE (same as ElementAcc) // which are float, so Element type here doesn't matter. 
- using MlaSm100Type = MlaSm100; + using MlaSm100Type = MlaSm100; // Get split kv. Requires problem shape and sm_count only. typename MlaSm100Type::Fmha::Arguments arguments; diff --git a/tests/kernels/test_cutlass_mla_decode.py b/tests/kernels/test_cutlass_mla_decode.py index 2b745b84dae6c..85984324b1967 100644 --- a/tests/kernels/test_cutlass_mla_decode.py +++ b/tests/kernels/test_cutlass_mla_decode.py @@ -1,96 +1,180 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import math +import random + import pytest import torch -import torch.nn.functional as F -from torch import Tensor import vllm._custom_ops as ops from vllm.platforms import current_platform - -if not current_platform.has_device_capability(100): - pytest.skip( - reason="Cutlass MLA Requires compute capability of 10 or above.", - allow_module_level=True) +from vllm.triton_utils import triton -def ref_mla( - out: Tensor, # (bs, num_heads, v_head_dim) - query: Tensor, # (bs, num_heads, head_dim) - kv_cache: Tensor, # (num_blocks, block_size, head_dim) - scale: float, - block_tables: Tensor, # (bs, max_num_blocks) - seq_lens: Tensor, # (bs,) -): - bs, num_heads, v_head_dim = out.shape - head_dim = query.shape[2] - - for i in range(bs): - # gather and flatten KV-cache - kv = kv_cache[ - block_tables[i]] # (max_num_blocks, block_size, head_dim) - kv = kv.view(1, -1, - head_dim)[:, :seq_lens[i]] # (1, seq_len, head_dim) - v = kv[:, :, :v_head_dim] - - q = query[i].view(num_heads, 1, head_dim) - o = F.scaled_dot_product_attention(q, - kv, - v, - scale=scale, - enable_gqa=True) - out[i] = o.view(num_heads, v_head_dim) - - return out - - -@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) -@pytest.mark.parametrize("mean_seq_len", [128, 1024, 4096]) -@pytest.mark.parametrize("bs", [1, 2, 4]) -@pytest.mark.parametrize("varlen", [False, True]) -@pytest.mark.parametrize("block_size", [16, 64, 128]) -def test_cutlass_mla_decode(dtype: 
torch.dtype, mean_seq_len: int, bs: int, - varlen: bool, block_size: int): - torch.set_default_dtype(dtype) - torch.set_default_device('cuda') - torch.manual_seed(42) - - d = 576 - h_q = 128 - dv = 512 - - q_nope_dim = 128 - q_pe_dim = 64 - scale = (q_nope_dim + q_pe_dim)**(-0.5) - if varlen: - seq_lens = torch.empty(bs).normal_(mean_seq_len, mean_seq_len / 2) - seq_lens = seq_lens.clip(2).to(torch.int32) +def cal_diff(x: torch.Tensor, + y: torch.Tensor, + name: str, + use_fp8: bool = False) -> None: + x, y = x.double(), y.double() + cos_diff = 1 - 2 * (x * y).sum().item() / max( + (x * x + y * y).sum().item(), 1e-12) + if (use_fp8): + assert cos_diff < 1e-4 else: - seq_lens = torch.full((bs, ), mean_seq_len, dtype=torch.int32) - max_seq_len = seq_lens.max().item() - block_num = (max_seq_len + block_size - 1) // block_size + assert cos_diff < 1e-5 - # Pad block_num so that small blocks can be packed into full 128-sized - # CUTLASS tiles. One 128-wide tile can hold (128 // block_size) small - # blocks. - pack_factor = 128 // block_size - block_num = ((block_num + pack_factor - 1) // pack_factor) * pack_factor - # Amplify input values to ensure test coverage of edge cases where CUTLASS - # kernel errors occur with split_k settings. - q = torch.randn(bs, h_q, d) * 100 - block_table = torch.randint(0, - bs * block_num, (bs, block_num), - dtype=torch.int32) +CUTLASS_MLA_UNSUPPORTED_REASON = \ + "Cutlass MLA Requires compute capability of 10 or above." 
\ + if not current_platform.is_device_capability(100) \ + else "Cutlass MLA is supported" - kv_cache = torch.randn(block_table.numel(), block_size, d) - out_ref = q.new_zeros(bs, h_q, dv) - ref_mla(out_ref, q, kv_cache, scale, block_table, seq_lens) - out_ans = torch.zeros_like(out_ref) - q_nope = q[:, :, :dv].clone() - q_pe = q[:, :, dv:].clone() - ops.cutlass_mla_decode(out_ans, q_nope, q_pe, kv_cache, seq_lens, - block_table, scale) +@pytest.mark.skipif(not current_platform.has_device_capability(100), + reason=CUTLASS_MLA_UNSUPPORTED_REASON) +@pytest.mark.parametrize("b", [128]) +@pytest.mark.parametrize("s_q", [1]) +@pytest.mark.parametrize("mean_sk", [4096, 8192, 16384]) +@pytest.mark.parametrize("h_q", [16, 32, 64, 128]) +@pytest.mark.parametrize("h_kv", [1]) +@pytest.mark.parametrize("d", [576]) +@pytest.mark.parametrize("dv", [512]) +@pytest.mark.parametrize("block_size", [64]) +@pytest.mark.parametrize("causal", [True]) +@pytest.mark.parametrize("varlen", [False, True]) +@pytest.mark.parametrize("torch_dtype", [torch.bfloat16, torch.float8_e4m3fn]) +@torch.inference_mode() +def test_cutlass_mla_decode(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, + causal, varlen, torch_dtype): + device = torch.device("cuda:0") + if torch_dtype == torch.float8_e4m3fn: + init_dtype = torch.bfloat16 + else: + init_dtype = torch_dtype + torch.set_default_dtype(init_dtype) + torch.set_default_device(device) + torch.cuda.set_device(device) + torch.manual_seed(42) + random.seed(42) - torch.testing.assert_close(out_ans, out_ref, atol=1e-2, rtol=1e-2) + print(f"{b=}, {s_q=}, {mean_sk=}, {h_q=}, {h_kv=}, " + f"{d=}, {dv=}, {causal=}, {varlen=}, {torch_dtype=}") + + use_fp8 = torch_dtype == torch.float8_e4m3fn + scale = math.sqrt(d)**(-1) + cache_seqlens = torch.full((b, ), mean_sk, dtype=torch.int32) + if varlen: + for i in range(b): + cache_seqlens[i] = max(random.normalvariate(mean_sk, mean_sk / 2), + s_q) + total_seqlens = cache_seqlens.sum().item() + max_seqlen = 
cache_seqlens.max().item() + max_seqlen_pad = triton.cdiv(max_seqlen, 256) * 256 + + q = torch.randn(b, s_q, h_q, d) + block_table = torch.arange(b * max_seqlen_pad // block_size, + dtype=torch.int32).view( + b, max_seqlen_pad // block_size) + blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d) + blocked_v = blocked_k[..., :dv] + + init_dtype = q.dtype + if use_fp8: + fp8_dtype = torch.float8_e4m3fn + descale_q = torch.ones((1), dtype=torch.float32) + descale_k = torch.ones((1), dtype=torch.float32) + + q = q.to(fp8_dtype) + blocked_k = blocked_k.to(fp8_dtype) + blocked_v = blocked_v.to(fp8_dtype) + else: + descale_q = None + descale_k = None + + def cutlass_mla(): + MAX_HEADS = 128 + + q_reshaped = q.squeeze(1) + q_nope = q_reshaped[:, :, :dv].clone() + q_pe = q_reshaped[:, :, dv:].clone() + + if h_q < MAX_HEADS: + q_nope_padded = q_nope.new_empty((b, MAX_HEADS, dv)) + q_nope_padded[:, :h_q] = q_nope + q_nope = q_nope_padded + + q_pe_padded = q_pe.new_empty((b, MAX_HEADS, d - dv)) + q_pe_padded[:, :h_q] = q_pe + q_pe = q_pe_padded + + kv_cache_flat = blocked_k.squeeze(2) + device_properties = torch.cuda.get_device_properties( + torch.device("cuda:0")) + sm_count = device_properties.multi_processor_count + workspace_size = ops.sm100_cutlass_mla_get_workspace_size( + max_seqlen * block_size, b, sm_count, num_kv_splits=1) + workspace = torch.empty(workspace_size, + device="cuda", + dtype=torch.uint8) + + out_ans = torch.empty(b, MAX_HEADS, dv, dtype=init_dtype) + + ops.sm100_cutlass_mla_decode(out_ans, q_nope, q_pe, kv_cache_flat, + cache_seqlens, block_table, workspace, + scale, 1) + return out_ans[:, :h_q].contiguous() + + def scaled_dot_product_attention(query, key, value, is_causal=False): + query = query.float() + key = key.float() + value = value.float() + key = key.repeat_interleave(h_q // h_kv, dim=0) + value = value.repeat_interleave(h_q // h_kv, dim=0) + attn_weight = query @ key.transpose(-2, -1) / math.sqrt(query.size(-1)) + if is_causal: + 
s_q = query.shape[-2] + s_k = key.shape[-2] + attn_bias = torch.zeros(s_q, s_k, dtype=query.dtype) + temp_mask = torch.ones(s_q, s_k, + dtype=torch.bool).tril(diagonal=s_k - s_q) + attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf")) + attn_bias.to(query.dtype) + attn_weight += attn_bias + lse = attn_weight.logsumexp(dim=-1) + attn_weight = torch.softmax(attn_weight, dim=-1, dtype=torch.float32) + return attn_weight @ value, lse + + def ref_mla(): + q_ = (q.to(torch.float) * descale_q).to(init_dtype) if use_fp8 else q + blocked_k_ = (blocked_k.to(torch.float) * + descale_k).to(init_dtype) if use_fp8 else blocked_k + blocked_v_ = (blocked_v.to(torch.float) * + descale_k).to(init_dtype) if use_fp8 else blocked_v + out = torch.empty(b, s_q, h_q, dv, dtype=torch.float32) + lse = torch.empty(b, h_q, s_q, dtype=torch.float32) + for i in range(b): + begin = i * max_seqlen_pad + end = begin + cache_seqlens[i] + out_i, lse_i = scaled_dot_product_attention( + q_[i].transpose(0, 1), + blocked_k_.view(-1, h_kv, d)[begin:end].transpose(0, 1), + blocked_v_.view(-1, h_kv, dv)[begin:end].transpose(0, 1), + is_causal=causal, + ) + out[i] = out_i.transpose(0, 1) + lse[i] = lse_i + return out, lse + + out_cutlass = cutlass_mla() + out_torch, lse_torch = ref_mla() + # Extract the single token (s_q=1) slice to match cutlass output shape + out_torch_slice = out_torch[:, 0, :, :] # [b, h_q, dv] + cal_diff(out_cutlass, out_torch_slice, "out", use_fp8) + + t = triton.testing.do_bench(cutlass_mla) + FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 + bytes = (total_seqlens * h_kv * d + + b * s_q * h_q * d) * (torch.finfo(torch_dtype).bits // 8) + ( + b * s_q * h_q * dv) * (torch.finfo(init_dtype).bits // 8) + print(f"{t:.3f} ms, {FLOPS / 10 ** 9 / t:.0f} TFLOPS,", + f"{bytes / 10 ** 6 / t:.0f} GB/s") diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 5cbb7346436ef..c65c987c0e488 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -500,8 +500,8 @@ 
class CudaPlatformBase(Platform): else: attention_backend = "FLASHMLA" - # Only FlashMLA supports fp8 - if attention_backend == "FLASHMLA": + # Only FlashMLA and CUTLASS_MLA support fp8 + if attention_backend in ["FLASHMLA", "CUTLASS_MLA"]: supported = True else: supported = (not fp8_attention) diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py index 8a17d3a492783..705307d4dea3d 100644 --- a/vllm/v1/attention/backends/mla/cutlass_mla.py +++ b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -108,10 +108,6 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): "are not implemented for " "CutlassMLAImpl") - if is_quantized_kv_cache(self.kv_cache_dtype): - raise NotImplementedError( - "CutlassMLA V1 with FP8 KV cache not yet supported") - self._use_old_cutlass_mla = False force_old_cutlass = os.environ.get("FORCE_OLD_CUTLASS_MLA", None) if force_old_cutlass: @@ -182,11 +178,10 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): > 0), f"block num must be greater than 0, got {block_num}" assert block_num % (128 / PAGE_SIZE) == 0 - # TODO(kaixih@nvidia): support fp8 assert q_nope.dtype in ( - torch.float16, - torch.bfloat16, - ), f"q_nope.dtype needs to be fp16 or bf16 but got {q_nope.dtype}." + torch.float16, torch.bfloat16, torch.float8_e4m3fn), ( + f"q_nope.dtype needs to be fp16 or bf16 or e4m3 but got " + f"{q_nope.dtype}.") assert q_nope.dtype == q_pe.dtype == kv_c_and_k_pe_cache.dtype assert ( seq_lens.dtype == torch.int32 @@ -195,7 +190,9 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): page_table.dtype == torch.int32 ), f"page_table.dtype needs to be int32 but got {page_table.dtype}." 
- out = q_nope.new_empty((B_q, MAX_HEADS, D_latent)) + dtype = (torch.bfloat16 if is_quantized_kv_cache(self.kv_cache_dtype) + else q_nope.dtype) + out = q_nope.new_empty((B_q, MAX_HEADS, D_latent), dtype=dtype) ops.sm100_cutlass_mla_decode( out, @@ -220,9 +217,6 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): assert kv_c_and_k_pe_cache.numel() > 0 assert attn_metadata.decode is not None - if self.kv_cache_dtype.startswith("fp8"): - raise NotImplementedError("FP8 Cutlass MLA not yet supported") - # Adjust workspace size (if necessary) self._workspace.ensure_size(attn_metadata, self._num_kv_splits) @@ -252,8 +246,9 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): assert kv_c_and_k_pe_cache.numel() > 0 assert attn_metadata.decode is not None - if self.kv_cache_dtype.startswith("fp8"): - raise NotImplementedError("FP8 Cutlass MLA not yet supported") + if is_quantized_kv_cache(self.kv_cache_dtype): + raise NotImplementedError( + "FP8 Cutlass MLA not supported with FORCE_OLD_CUTLASS_MLA") B = q_nope.shape[0] From 6adaed42f49ff683f80521b73daa3a3bde413baa Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Thu, 4 Sep 2025 03:14:30 +0800 Subject: [PATCH 63/95] [Feature][P/D]: Optimize NIXL Connector xfer Launch (#23887) Signed-off-by: ycyaw66 <497410282@qq.com> Co-authored-by: ycyaw66 <497410282@qq.com> --- .../kv_connector/v1/nixl_connector.py | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 8f16babfe2aeb..de9cbc660666a 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -14,6 +14,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Optional import msgspec +import numpy as np import torch import zmq @@ -1191,8 
+1192,8 @@ class NixlConnectorWorker: # workers will issue xfers to parts of the P worker remote kv caches. # Get descs ids. - local_block_descs_ids: list[int] = [] - remote_block_descs_ids: list[int] = [] + local_block_descs_ids: np.ndarray + remote_block_descs_ids: np.ndarray if not self.block_window_per_layer: # Default case: assume global attention remote_block_descs_ids = self._get_block_descs_ids( @@ -1202,6 +1203,8 @@ class NixlConnectorWorker: else: # TODO(mgoin): remove this once we have hybrid memory allocator # Optimization for models with local attention (Llama 4) + local_descs_list = [] + remote_descs_list = [] for layer_idx, block_window in enumerate( self.block_window_per_layer): # For each layer: @@ -1221,8 +1224,11 @@ class NixlConnectorWorker: layer_remote_desc_ids = self._get_block_descs_ids( dst_engine_id, layer_remote_block_ids, layer_idx) - local_block_descs_ids.extend(layer_local_desc_ids) - remote_block_descs_ids.extend(layer_remote_desc_ids) + local_descs_list.append(layer_local_desc_ids) + remote_descs_list.append(layer_remote_desc_ids) + + local_block_descs_ids = np.concatenate(local_descs_list) + remote_block_descs_ids = np.concatenate(remote_descs_list) assert len(local_block_descs_ids) == len(remote_block_descs_ids) @@ -1247,14 +1253,14 @@ class NixlConnectorWorker: def _get_block_descs_ids(self, engine_id: str, block_ids: list[int], - layer_idx: Optional[int] = None) -> list[int]: + layer_idx: Optional[int] = None) -> np.ndarray: """ Get the descs ids for a set of block ids. If layer_idx is provided, we use the region_ids for the given layer. Otherwise, we use all regions. """ if layer_idx is None: - region_ids = range(self.num_regions) + region_ids = np.arange(self.num_regions) else: assert layer_idx < self.num_layers if self.num_layers < self.num_regions: @@ -1262,20 +1268,19 @@ class NixlConnectorWorker: # the regions are organized as [K0, V0, K1, V1, ...] 
# and we select K_i and V_i assert 2 * self.num_layers == self.num_regions - region_ids = range(2 * layer_idx, 2 * layer_idx + 2) + region_ids = np.arange(2 * layer_idx, 2 * layer_idx + 2) else: # Otherwise, we assume we have MLA and select i-th layer assert self.num_layers == self.num_regions - region_ids = range(layer_idx, layer_idx + 1) + region_ids = np.arange(layer_idx, layer_idx + 1) num_blocks = self.dst_num_blocks[engine_id] # Compute the desc ids for each block. - descs_ids: list[int] = [] - for reg_id in region_ids: - for block_id in block_ids: - descs_ids.append(reg_id * num_blocks + block_id) - return descs_ids + region_ids = region_ids[:, None] + block_ids = np.array(block_ids)[None, :] + descs_ids = region_ids * num_blocks + block_ids + return descs_ids.flatten() def get_backend_aware_kv_block_len(self): """ From a43a3f1770525b7ac88151667da76bc2f15ec50d Mon Sep 17 00:00:00 2001 From: Kebe Date: Thu, 4 Sep 2025 05:21:36 +0900 Subject: [PATCH 64/95] [Bugfix][DP] DP distribution does not require ray[default] (#23822) Signed-off-by: Kebe --- vllm/v1/engine/utils.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index 56ef8477d267a..ed0129fda9474 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -315,7 +315,6 @@ class CoreEngineActorManager: import ray from ray._private.state import available_resources_per_node - from ray.util.state import list_nodes logger.info("Creating placement groups for data parallel") dp_master_ip = \ @@ -324,31 +323,28 @@ class CoreEngineActorManager: local_engine_count = \ vllm_config.parallel_config.data_parallel_size_local - nodes = sorted(list_nodes(filters=[("state", "=", "ALIVE")]), - key=lambda node: node.node_ip != dp_master_ip) - assert nodes[0].node_ip == dp_master_ip, ( - "The head node is missing or dead") - assert len(nodes) == 1 or nodes[1].node_ip != dp_master_ip, ( - "There can only be one head node") 
- available_resources = available_resources_per_node() world_size = vllm_config.parallel_config.world_size placement_groups: list[PlacementGroup] = [] local_dp_ranks: list[int] = [] - - for node in nodes: - node_ip = node.node_ip - node_resources = available_resources[node.node_id] + dp_master_ip_key = f'node:{dp_master_ip}' + nodes = sorted(available_resources.values(), + key=lambda x: dp_master_ip_key not in x) + assert len(nodes) > 0, ( + "No nodes with resources found in Ray cluster.") + assert dp_master_ip_key in nodes[0], ( + "The DP master node (ip: %s) is missing or dead", dp_master_ip) + for node_resources in nodes: if "GPU" not in node_resources: continue # For now, each DP rank can only be assigned to one node # TODO(rui): support allocating a single DP rank # to multiple nodes available_engine_count = int(node_resources["GPU"]) // world_size - if node_ip == dp_master_ip: + if dp_master_ip_key in node_resources: assert available_engine_count >= local_engine_count, ( "Not enough resources to allocate DP ranks " - f"on DP master node {node_ip}") + f"on DP master node {dp_master_ip}") for i in range(local_engine_count): bundles = [{ "GPU": 1.0, From 36c260dad604ccc845150753f2530b5b2ba9d7e6 Mon Sep 17 00:00:00 2001 From: George Nagy II Date: Wed, 3 Sep 2025 15:08:47 -0600 Subject: [PATCH 65/95] [Feature][gpt-oss] Add support for num_cached_tokens and num_reasoning_tokens tracking (#23460) Signed-off-by: George Nagy II Signed-off-by: Chen Zhang --- vllm/entrypoints/context.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index 9d587e8669339..52e35bcac9619 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -93,17 +93,35 @@ class HarmonyContext(ConversationContext): # as new prompt each time. Hence the sum. 
self.num_prompt_tokens += len(output.prompt_token_ids) + def _update_num_cached_tokens(self, output: RequestOutput): + if output.num_cached_tokens is not None: + #Similar to num_prompt_tokens + self.num_cached_tokens += output.num_cached_tokens + def _update_num_output_tokens(self, token_ids: Sequence[int]): self.num_output_tokens += len(token_ids) + def _update_num_reasoning_tokens(self, token_ids: Sequence[int]): + # Count tokens that are part of reasoning content (analysis channel + # or tool-directed messages like python/browser calls) + is_analysis = self.parser.current_channel == "analysis" + is_tool_call = (self.parser.current_recipient is not None and + (self.parser.current_recipient.startswith("python") or + self.parser.current_recipient.startswith("browser."))) + if is_analysis or is_tool_call: + self.num_reasoning_tokens += len(token_ids) + def append_output(self, output) -> None: if isinstance(output, RequestOutput): self._update_num_prompt_tokens(output) + self._update_num_cached_tokens(output) output_token_ids = output.outputs[0].token_ids self._update_num_output_tokens(output_token_ids) self.parser = get_streamable_parser_for_assistant() for token_id in output_token_ids: self.parser.process(token_id) + # Check if the current token is part of reasoning content + self._update_num_reasoning_tokens([token_id]) output_msgs = self.parser.messages else: # Tool output. @@ -204,6 +222,7 @@ class StreamingHarmonyContext(HarmonyContext): # so we only want to add the prompt tokens once for each message. 
if self.first_tok_of_message: self._update_num_prompt_tokens(output) + self._update_num_cached_tokens(output) # Reset self.first_tok_of_message if needed: # if the current token is the last one of the current message # (finished=True), then the next token processed will mark the @@ -212,6 +231,8 @@ class StreamingHarmonyContext(HarmonyContext): tok = output.outputs[0].token_ids[0] self.parser.process(tok) self._update_num_output_tokens(output.outputs[0].token_ids) + # Check if the current token is part of reasoning content + self._update_num_reasoning_tokens([tok]) self.last_tok = tok else: # Handle the case of tool output in direct message format From b5ee1e3261d9edf94d76ba8b437ebdef7ac599ea Mon Sep 17 00:00:00 2001 From: Peter Pan Date: Thu, 4 Sep 2025 06:49:16 +0800 Subject: [PATCH 66/95] Remove deprecated `PyNcclConnector` (#24151) Signed-off-by: Peter Pan --- .../disagg_benchmarks/disagg_overhead_benchmark.sh | 4 ++-- .../disagg_performance_benchmark.sh | 4 ++-- examples/offline_inference/disaggregated_prefill.py | 12 ++++++------ examples/online_serving/disaggregated_prefill.sh | 4 ++-- tests/kv_transfer/test_lookup_buffer.py | 2 +- tests/kv_transfer/test_send_recv.py | 2 +- vllm/config/__init__.py | 2 +- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh index 92f97ffabea2a..2c72941cf7e51 100644 --- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh @@ -62,7 +62,7 @@ benchmark() { --max-model-len 10000 \ --gpu-memory-utilization 0.6 \ --kv-transfer-config \ - '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & CUDA_VISIBLE_DEVICES=1 python3 \ @@ -72,7 +72,7 @@ benchmark() { 
--max-model-len 10000 \ --gpu-memory-utilization 0.6 \ --kv-transfer-config \ - '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & wait_for_server 8100 wait_for_server 8200 diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh index af2bcba3ea57a..0bbf7cd2b1c81 100644 --- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh @@ -69,7 +69,7 @@ launch_disagg_prefill() { --max-model-len 10000 \ --gpu-memory-utilization 0.6 \ --kv-transfer-config \ - '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & CUDA_VISIBLE_DEVICES=1 python3 \ -m vllm.entrypoints.openai.api_server \ @@ -78,7 +78,7 @@ launch_disagg_prefill() { --max-model-len 10000 \ --gpu-memory-utilization 0.6 \ --kv-transfer-config \ - '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & wait_for_server 8100 wait_for_server 8200 diff --git a/examples/offline_inference/disaggregated_prefill.py b/examples/offline_inference/disaggregated_prefill.py index 05a361fee0717..f619fa584f801 100644 --- a/examples/offline_inference/disaggregated_prefill.py +++ b/examples/offline_inference/disaggregated_prefill.py @@ -30,12 +30,12 @@ def run_prefill(prefill_done): ] sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) - # Using PyNcclConnector to transmit KV caches between vLLM instances. 
+ # Using P2pNcclConnector to transmit KV caches between vLLM instances. # This instance is the prefill node (kv_producer, rank 0). # The number of parallel instances for KV cache transfer is set to 2, - # as required for PyNcclConnector. + # as required for P2pNcclConnector. ktc = KVTransferConfig( - kv_connector="PyNcclConnector", + kv_connector="P2pNcclConnector", kv_role="kv_producer", kv_rank=0, kv_parallel_size=2, @@ -74,12 +74,12 @@ def run_decode(prefill_done): ] sampling_params = SamplingParams(temperature=0, top_p=0.95) - # Using PyNcclConnector to transmit KV caches between vLLM instances. + # Using P2pNcclConnector to transmit KV caches between vLLM instances. # This instance is the decode node (kv_consumer, rank 1). # The number of parallel instances for KV cache transfer is set to 2, - # as required for PyNcclConnector. + # as required for P2pNcclConnector. ktc = KVTransferConfig( - kv_connector="PyNcclConnector", + kv_connector="P2pNcclConnector", kv_role="kv_consumer", kv_rank=1, kv_parallel_size=2, diff --git a/examples/online_serving/disaggregated_prefill.sh b/examples/online_serving/disaggregated_prefill.sh index 6925dc8af07e9..d434e22b1ae88 100644 --- a/examples/online_serving/disaggregated_prefill.sh +++ b/examples/online_serving/disaggregated_prefill.sh @@ -53,7 +53,7 @@ CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \ --gpu-memory-utilization 0.8 \ --trust-remote-code \ --kv-transfer-config \ - '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' & # decoding instance, which is the KV consumer CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \ @@ -62,7 +62,7 @@ CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \ --gpu-memory-utilization 0.8 \ --trust-remote-code \ --kv-transfer-config \ - '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' & + 
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' & # wait until prefill and decode instances are ready wait_for_server 8100 diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py index 352ab63552de7..ca2f04dabfc98 100644 --- a/tests/kv_transfer/test_lookup_buffer.py +++ b/tests/kv_transfer/test_lookup_buffer.py @@ -128,7 +128,7 @@ if __name__ == "__main__": print(f"initialized! My rank is {my_rank}") config = KVTransferConfig( - kv_connector='PyNcclConnector', + kv_connector='P2pNcclConnector', kv_buffer_device='cuda', kv_buffer_size=1e9, kv_rank=my_rank, diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py index 32116608a2177..99ad2b43aeac8 100644 --- a/tests/kv_transfer/test_send_recv.py +++ b/tests/kv_transfer/test_send_recv.py @@ -137,7 +137,7 @@ if __name__ == "__main__": ) config = KVTransferConfig( - kv_connector='PyNcclConnector', + kv_connector='P2pNcclConnector', kv_buffer_device='cuda', kv_buffer_size=1e9, kv_rank=my_rank, diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index fd3ad2c8a6d6a..2cea2695a66e5 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3247,7 +3247,7 @@ class KVTransferConfig: kv_parallel_size: int = 1 """The number of parallel instances for KV cache transfer. 
For - PyNcclConnector, this should be 2.""" + P2pNcclConnector, this should be 2.""" kv_ip: str = "127.0.0.1" """The KV connector ip, used to build distributed connection.""" From a38f8bd54c861d37acd1bf6497b86edf664a6ab7 Mon Sep 17 00:00:00 2001 From: wuhang Date: Thu, 4 Sep 2025 12:05:10 +0800 Subject: [PATCH 67/95] [Feature][Responses API]Support MCP tools with streaming mode + background mode (#23927) Signed-off-by: wuhang --- .../openai/test_response_api_with_harmony.py | 19 ++- vllm/entrypoints/openai/api_server.py | 16 ++- vllm/entrypoints/openai/serving_responses.py | 129 ++++++++++++++---- 3 files changed, 138 insertions(+), 26 deletions(-) diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index 72d468db08f65..0d5836fab5a7c 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -275,7 +275,8 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_streaming(client: OpenAI, model_name: str): +@pytest.mark.parametrize("background", [True, False]) +async def test_streaming(client: OpenAI, model_name: str, background: bool): # TODO: Add back when web search and code interpreter are available in CI prompts = [ "tell me a story about a cat in 20 words", @@ -300,11 +301,16 @@ async def test_streaming(client: OpenAI, model_name: str): # }, ], stream=True, + background=background, ) events = [] current_event_mode = None + resp_id = None async for event in response: + if event.type == "response.created": + resp_id = event.response.id + if current_event_mode != event.type: current_event_mode = event.type print(f"\n[{event.type}] ", end="", flush=True) @@ -322,6 +328,17 @@ async def test_streaming(client: OpenAI, model_name: str): assert len(events) > 0 + if background: + starting_after = 5 + async 
with await client.responses.retrieve( + response_id=resp_id, + stream=True, + starting_after=starting_after) as stream: + counter = starting_after + async for event in stream: + counter += 1 + assert event == events[counter] + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 3cebfdf885bec..b6667ebf152e1 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -616,14 +616,23 @@ async def create_responses(request: ResponsesRequest, raw_request: Request): @router.get("/v1/responses/{response_id}") -async def retrieve_responses(response_id: str, raw_request: Request): +async def retrieve_responses( + response_id: str, + raw_request: Request, + starting_after: Optional[int] = None, + stream: Optional[bool] = False, +): handler = responses(raw_request) if handler is None: return base(raw_request).create_error_response( message="The model does not support Responses API") try: - response = await handler.retrieve_responses(response_id) + response = await handler.retrieve_responses( + response_id, + starting_after=starting_after, + stream=stream, + ) except Exception as e: raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)) from e @@ -631,6 +640,9 @@ async def retrieve_responses(response_id: str, raw_request: Request): if isinstance(response, ErrorResponse): return JSONResponse(content=response.model_dump(), status_code=response.error.code) + elif stream: + return StreamingResponse(content=response, + media_type="text/event-stream") return JSONResponse(content=response.model_dump()) diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 7f11b37e51728..58424c9d9f7be 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -4,6 +4,7 @@ import asyncio import json import 
time +from collections import deque from collections.abc import AsyncGenerator, AsyncIterator, Sequence from contextlib import AsyncExitStack from copy import copy @@ -55,7 +56,7 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse, # yapf: enable from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels -from vllm.entrypoints.tool_server import MCPToolServer, ToolServer +from vllm.entrypoints.tool_server import ToolServer from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.logger import init_logger from vllm.logprobs import Logprob as SampleLogprob @@ -168,6 +169,11 @@ class OpenAIServingResponses(OpenAIServing): # never remove messages from the store. self.msg_store: dict[str, list[ChatCompletionMessageParam]] = {} + # HACK(wuhang): This is a hack. We should use a better store. + # FIXME: If enable_store=True, this may cause a memory leak since we + # never remove events from the store. + self.event_store: dict[str, tuple[deque[str], asyncio.Event]] = {} + self.background_tasks: dict[str, asyncio.Task] = {} self.tool_server = tool_server @@ -249,15 +255,6 @@ class OpenAIServingResponses(OpenAIServing): if raw_request: raw_request.state.request_metadata = request_metadata - if self.tool_server is not None and isinstance( - self.tool_server, - MCPToolServer) and request.stream and request.tools and any( - tool.type in ["web_search_preview", "code_interpreter"] - for tool in request.tools): - return self.create_error_response( - "MCP tool server is not supported in background mode and " - "streaming mode") - # Schedule the request and get the result generator. generators: list[AsyncGenerator[ConversationContext, None]] = [] @@ -329,25 +326,44 @@ class OpenAIServingResponses(OpenAIServing): self.response_store[response.id] = response # Run the request in the background. 
- task = asyncio.create_task( - self._run_background_request( - request, - sampling_params, - result_generator, - context, - model_name, - tokenizer, - request_metadata, - created_time, - ), - name=f"create_{response.id}", - ) + if request.stream: + task = asyncio.create_task( + self._run_background_request_stream( + request, + sampling_params, + result_generator, + context, + model_name, + tokenizer, + request_metadata, + created_time, + ), + name=f"create_{request.request_id}", + ) + else: + task = asyncio.create_task( + self._run_background_request( + request, + sampling_params, + result_generator, + context, + model_name, + tokenizer, + request_metadata, + created_time, + ), + name=f"create_{response.id}", + ) # For cleanup. response_id = response.id self.background_tasks[response_id] = task task.add_done_callback( lambda _: self.background_tasks.pop(response_id, None)) + + if request.stream: + return self.responses_background_stream_generator( + request.request_id) return response if request.stream: @@ -736,6 +752,40 @@ class OpenAIServingResponses(OpenAIServing): prev_outputs.append(response_msg) return messages + async def _run_background_request_stream( + self, + request: ResponsesRequest, + *args, + **kwargs, + ): + event_deque: deque[str] = deque() + new_event_signal = asyncio.Event() + self.event_store[request.request_id] = (event_deque, new_event_signal) + response = None + try: + generator = self.responses_stream_generator( + request, *args, **kwargs) + async for event in generator: + event_deque.append(event) + new_event_signal.set() # Signal new event available + except Exception as e: + logger.exception("Background request failed for %s", + request.request_id) + response = self.create_error_response(str(e)) + finally: + # Mark as finished with a special marker + event_deque.append("__STREAM_END__") + new_event_signal.set() + + if response is not None and isinstance(response, ErrorResponse): + # If the request has failed, update the status to 
"failed". + response_id = request.request_id + async with self.response_store_lock: + stored_response = self.response_store.get(response_id) + assert stored_response is not None + if stored_response.status not in ("completed", "cancelled"): + stored_response.status = "failed" + async def _run_background_request( self, request: ResponsesRequest, @@ -759,9 +809,36 @@ class OpenAIServingResponses(OpenAIServing): if stored_response.status not in ("completed", "cancelled"): stored_response.status = "failed" + async def responses_background_stream_generator( + self, + response_id: str, + starting_after: Optional[int] = None, + ): + if response_id not in self.event_store: + raise ValueError(f"Unknown response_id: {response_id}") + + event_deque, new_event_signal = self.event_store[response_id] + start_index = 0 if starting_after is None else starting_after + 1 + current_index = start_index + + while True: + new_event_signal.clear() + + # Yield existing events from start_index + while current_index < len(event_deque): + event = event_deque[current_index] + if event == "__STREAM_END__": + return + yield event + current_index += 1 + + await new_event_signal.wait() + async def retrieve_responses( self, response_id: str, + starting_after: Optional[int], + stream: Optional[bool], ) -> Union[ErrorResponse, ResponsesResponse]: if not response_id.startswith("resp_"): return self._make_invalid_id_error(response_id) @@ -771,6 +848,12 @@ class OpenAIServingResponses(OpenAIServing): if response is None: return self._make_not_found_error(response_id) + + if stream: + return self.responses_background_stream_generator( + response_id, + starting_after, + ) return response async def cancel_responses( From e919d6f549f4da22fa60ea394f00aaf93ef23aa0 Mon Sep 17 00:00:00 2001 From: Qiming Zhang Date: Wed, 3 Sep 2025 21:37:37 -0700 Subject: [PATCH 68/95] [Kernel][Bugfix] Fix grouped topk cu (#24146) Signed-off-by: mayuyuace --- csrc/moe/grouped_topk_kernels.cu | 13 +++++++------ 1 file changed, 7 
insertions(+), 6 deletions(-) diff --git a/csrc/moe/grouped_topk_kernels.cu b/csrc/moe/grouped_topk_kernels.cu index 78f7b3cc1aa25..accbb09858fac 100644 --- a/csrc/moe/grouped_topk_kernels.cu +++ b/csrc/moe/grouped_topk_kernels.cu @@ -28,6 +28,7 @@ namespace cg = cooperative_groups; namespace vllm { namespace moe { +constexpr float kNegInfinity = INFINITY * -1; constexpr unsigned FULL_WARP_MASK = 0xffffffff; constexpr int32_t WARP_SIZE = 32; constexpr int32_t BLOCK_SIZE = 512; @@ -512,8 +513,8 @@ __global__ void group_idx_and_topk_idx_kernel( warp_id * topk; s_topk_idx += warp_id * topk; - T value = cuda::std::numeric_limits::min(); - T topk_group_value = cuda::std::numeric_limits::min(); + T value = kNegInfinity; + T topk_group_value = kNegInfinity; int32_t num_equalto_topkth_group; #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) @@ -539,11 +540,11 @@ __global__ void group_idx_and_topk_idx_kernel( __syncwarp(); // Ensure all threads have valid data before reduction topk_group_value = cg::reduce(tile, value, cg::greater()); if (value == topk_group_value) { - value = cuda::std::numeric_limits::min(); + value = kNegInfinity; } pre_count_equal_to_top_value = count_equal_to_top_value; count_equal_to_top_value = __popc(__ballot_sync( - FULL_WARP_MASK, (value == cuda::std::numeric_limits::min()))); + FULL_WARP_MASK, (value == cuda_cast(kNegInfinity)))); } num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value; } @@ -555,7 +556,7 @@ __global__ void group_idx_and_topk_idx_kernel( int count_equalto_topkth_group = 0; bool if_proceed_next_topk = - (topk_group_value != cuda::std::numeric_limits::min()); + (topk_group_value != cuda_cast(kNegInfinity)); if (case_id < num_tokens && if_proceed_next_topk) { for (int i_group = 0; i_group < n_group; i_group++) { if ((group_scores[i_group] > topk_group_value) || @@ -568,7 +569,7 @@ __global__ void group_idx_and_topk_idx_kernel( (i < num_experts_per_group) && isfinite(cuda_cast( scores_with_bias[offset + i])) ? 
scores_with_bias[offset + i] - : cuda::std::numeric_limits::min(); + : cuda_cast(kNegInfinity); queue.add(candidates, offset + i); } if (group_scores[i_group] == topk_group_value) { From 712b273f655fe007e4efbaae00f1a343cfeb0742 Mon Sep 17 00:00:00 2001 From: Flora Feng <4florafeng@gmail.com> Date: Wed, 3 Sep 2025 22:21:12 -0700 Subject: [PATCH 69/95] [Refactor] Introduce basic Renderer for completion-style request (#24010) Signed-off-by: sfeng33 <4florafeng@gmail.com> --- tests/entrypoints/test_renderer.py | 163 +++++++++++++ vllm/entrypoints/openai/serving_engine.py | 20 +- vllm/entrypoints/openai/serving_pooling.py | 26 +-- .../openai/serving_tokenization.py | 15 +- vllm/entrypoints/renderer.py | 219 ++++++++++++++++++ 5 files changed, 416 insertions(+), 27 deletions(-) create mode 100644 tests/entrypoints/test_renderer.py create mode 100644 vllm/entrypoints/renderer.py diff --git a/tests/entrypoints/test_renderer.py b/tests/entrypoints/test_renderer.py new file mode 100644 index 0000000000000..54b5271ba67a6 --- /dev/null +++ b/tests/entrypoints/test_renderer.py @@ -0,0 +1,163 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from dataclasses import dataclass +from typing import Optional +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from vllm.entrypoints.renderer import CompletionRenderer + + +@dataclass +class MockModelConfig: + max_model_len: int = 100 + encoder_config: Optional[dict] = None + + +class MockTokenizerResult: + + def __init__(self, input_ids): + self.input_ids = input_ids + + +@pytest.fixture +def mock_model_config(): + return MockModelConfig() + + +@pytest.fixture +def mock_tokenizer(): + tokenizer = MagicMock() + return tokenizer + + +@pytest.fixture +def mock_async_tokenizer(): + async_tokenizer = AsyncMock() + return async_tokenizer + + +@pytest.fixture +def renderer(mock_model_config, mock_tokenizer): + return 
CompletionRenderer(model_config=mock_model_config, + tokenizer=mock_tokenizer, + async_tokenizer_pool={}) + + +class TestRenderPrompt: + """Test Category A: Basic Functionality Tests""" + + @pytest.mark.asyncio + async def test_token_input(self, renderer): + tokens = [101, 7592, 2088] + results = await renderer.render_prompt(prompt_or_prompts=tokens, + max_length=100) + + assert len(results) == 1 + assert results[0]["prompt_token_ids"] == tokens + + @pytest.mark.asyncio + async def test_token_list_input(self, renderer): + token_lists = [[101, 7592, 2088], [102, 1234, 5678, 9012], [103, 4567]] + results = await renderer.render_prompt(prompt_or_prompts=token_lists, + max_length=100) + + assert len(results) == 3 + assert results[0]["prompt_token_ids"] == [101, 7592, 2088] + assert results[1]["prompt_token_ids"] == [102, 1234, 5678, 9012] + assert results[2]["prompt_token_ids"] == [103, 4567] + + @pytest.mark.asyncio + async def test_text_input(self, renderer, mock_async_tokenizer): + mock_async_tokenizer.return_value = MockTokenizerResult( + [101, 7592, 2088]) + renderer.async_tokenizer_pool[ + renderer.tokenizer] = mock_async_tokenizer + + results = await renderer.render_prompt(prompt_or_prompts="Hello world", + max_length=100) + + assert len(results) == 1 + assert results[0]["prompt_token_ids"] == [101, 7592, 2088] + mock_async_tokenizer.assert_called_once() + + @pytest.mark.asyncio + async def test_text_list_input(self, renderer, mock_async_tokenizer): + mock_async_tokenizer.return_value = MockTokenizerResult( + [101, 7592, 2088]) + renderer.async_tokenizer_pool[ + renderer.tokenizer] = mock_async_tokenizer + + text_list_input = ["Hello world", "How are you?", "Good morning"] + results = await renderer.render_prompt( + prompt_or_prompts=text_list_input, max_length=100) + + assert len(results) == 3 + for result in results: + assert result["prompt_token_ids"] == [101, 7592, 2088] + assert mock_async_tokenizer.call_count == 3 + + @pytest.mark.asyncio + async def 
test_no_truncation(self, renderer, mock_async_tokenizer): + mock_async_tokenizer.return_value = MockTokenizerResult( + [101, 7592, 2088]) + renderer.async_tokenizer_pool[ + renderer.tokenizer] = mock_async_tokenizer + + results = await renderer.render_prompt(prompt_or_prompts="Hello world", + max_length=100) + + assert len(results) == 1 + call_args = mock_async_tokenizer.call_args + assert "truncation" not in call_args.kwargs or call_args.kwargs[ + "truncation"] is False + + @pytest.mark.asyncio + async def test_truncation_positive(self, renderer, mock_async_tokenizer): + mock_async_tokenizer.return_value = MockTokenizerResult( + [101, 7592, 2088]) # Truncated + renderer.async_tokenizer_pool[ + renderer.tokenizer] = mock_async_tokenizer + + results = await renderer.render_prompt(prompt_or_prompts="Hello world", + max_length=100, + truncate_prompt_tokens=50) + + assert len(results) == 1 + call_args = mock_async_tokenizer.call_args + assert call_args.kwargs["truncation"] is True + assert call_args.kwargs["max_length"] == 50 + + @pytest.mark.asyncio + async def test_token_truncation_last_elements(self, renderer): + # Test that token truncation keeps the last N elements + long_tokens = [100, 101, 102, 103, 104, 105, 106, 107, 108, + 109] # 10 tokens + results = await renderer.render_prompt(prompt_or_prompts=long_tokens, + max_length=100, + truncate_prompt_tokens=5) + + assert len(results) == 1 + # Should keep the last 5 tokens: [105, 106, 107, 108, 109] + assert results[0]["prompt_token_ids"] == [105, 106, 107, 108, 109] + + @pytest.mark.asyncio + async def test_max_length_exceeded(self, renderer): + long_tokens = list(range(150)) # Exceeds max_model_len=100 + + with pytest.raises(ValueError, match="maximum context length"): + await renderer.render_prompt(prompt_or_prompts=long_tokens, + max_length=100) + + @pytest.mark.asyncio + async def test_no_tokenizer_for_text(self, mock_model_config): + renderer_no_tokenizer = CompletionRenderer( + 
model_config=mock_model_config, + tokenizer=None, + async_tokenizer_pool={}) + + with pytest.raises(ValueError, match="No tokenizer available"): + await renderer_no_tokenizer.render_prompt( + prompt_or_prompts="Hello world", max_length=100) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index f506f7de16828..a218f6882f8ca 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -62,8 +62,10 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, TranslationRequest) from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.tool_parsers import ToolParser +from vllm.entrypoints.renderer import BaseRenderer, CompletionRenderer # yapf: enable from vllm.inputs.data import EmbedsPrompt as EngineEmbedsPrompt +from vllm.inputs.data import PromptType from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.inputs.parse import parse_and_batch_prompt from vllm.logger import init_logger @@ -243,6 +245,16 @@ class OpenAIServing: AsyncMicrobatchTokenizer] = {} self.log_error_stack = log_error_stack + def _get_renderer(self, tokenizer: Optional[AnyTokenizer]) -> BaseRenderer: + """ + Get a Renderer instance with the provided tokenizer. + Uses shared async tokenizer pool for efficiency. 
+ """ + return CompletionRenderer( + model_config=self.model_config, + tokenizer=tokenizer, + async_tokenizer_pool=self._async_tokenizer_pool) + def _get_async_tokenizer(self, tokenizer) -> AsyncMicrobatchTokenizer: """ Return (and cache) an `AsyncMicrobatchTokenizer` bound to the @@ -1098,7 +1110,7 @@ class OpenAIServing: def _log_inputs( self, request_id: str, - inputs: RequestPrompt, + inputs: Union[RequestPrompt, PromptType], params: Optional[Union[SamplingParams, PoolingParams, BeamSearchParams]], lora_request: Optional[LoRARequest], @@ -1110,11 +1122,9 @@ class OpenAIServing: prompt = inputs elif isinstance(inputs, list): prompt_token_ids = inputs - elif "prompt_embeds" in inputs: - prompt_embeds = inputs.get("prompt_embeds") else: - prompt = inputs["prompt"] - prompt_token_ids = inputs["prompt_token_ids"] + prompt = getattr(inputs, 'prompt', None) + prompt_token_ids = getattr(inputs, 'prompt_token_ids', None) self.request_logger.log_inputs( request_id, diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 685c98c817c3d..c08c0743ffca6 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -4,7 +4,7 @@ import asyncio import base64 import time -from collections.abc import AsyncGenerator, Sequence +from collections.abc import AsyncGenerator from typing import Final, Literal, Optional, Union, cast import jinja2 @@ -26,7 +26,7 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse, PoolingRequest, PoolingResponse, PoolingResponseData, UsageInfo) # yapf: enable -from vllm.entrypoints.openai.serving_engine import OpenAIServing, RequestPrompt +from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.utils import _validate_truncation_size from vllm.logger import init_logger @@ -104,6 +104,7 @@ class OpenAIServingPooling(OpenAIServing): else: tokenizer = 
await self.engine_client.get_tokenizer(lora_request ) + renderer = self._get_renderer(tokenizer) if getattr(request, "dimensions", None) is not None: return self.create_error_response( @@ -126,14 +127,11 @@ class OpenAIServingPooling(OpenAIServing): engine_prompts = await self.io_processor.pre_process_async( prompt=validated_prompt, request_id=request_id) - request_prompts: Sequence[RequestPrompt] = [ - "" - ] * len(engine_prompts) elif isinstance(request, PoolingChatRequest): ( _, - request_prompts, + _, engine_prompts, ) = await self._preprocess_chat( request, @@ -149,13 +147,13 @@ class OpenAIServingPooling(OpenAIServing): add_special_tokens=request.add_special_tokens, ) elif isinstance(request, PoolingCompletionRequest): - (request_prompts, - engine_prompts) = await self._preprocess_completion( - request, - tokenizer, - request.input, - add_special_tokens=request.add_special_tokens, - ) + engine_prompts = await renderer.render_prompt( + prompt_or_prompts=request.input, + max_length=self.max_model_len, + truncate_prompt_tokens=truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens, + cache_salt=getattr(request, 'cache_salt', None), + ) else: raise ValueError( f"Unsupported request of type {type(request)}") @@ -177,7 +175,7 @@ class OpenAIServingPooling(OpenAIServing): request_id_item = f"{request_id}-{i}" self._log_inputs(request_id_item, - request_prompts[i], + engine_prompt, params=pooling_params, lora_request=lora_request) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 2f258255d5f16..70cb6c21b2213 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -65,6 +65,7 @@ class OpenAIServingTokenization(OpenAIServing): lora_request = self._maybe_get_adapters(request) tokenizer = await self.engine_client.get_tokenizer(lora_request) + renderer = self._get_renderer(tokenizer) if isinstance(request, TokenizeChatRequest): 
tool_dicts = (None if request.tools is None else @@ -87,13 +88,11 @@ class OpenAIServingTokenization(OpenAIServing): add_special_tokens=request.add_special_tokens, ) else: - (request_prompts, - engine_prompts) = await self._preprocess_completion( - request, - tokenizer, - request.prompt, - add_special_tokens=request.add_special_tokens, - ) + engine_prompts = await renderer.render_prompt( + prompt_or_prompts=request.prompt, + add_special_tokens=request.add_special_tokens, + cache_salt=getattr(request, 'cache_salt', None), + ) except (ValueError, TypeError, jinja2.TemplateError) as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(f"{e} {e.__cause__}") @@ -101,7 +100,7 @@ class OpenAIServingTokenization(OpenAIServing): input_ids: list[int] = [] for i, engine_prompt in enumerate(engine_prompts): self._log_inputs(request_id, - request_prompts[i], + engine_prompt, params=None, lora_request=lora_request) diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py new file mode 100644 index 0000000000000..29200dda8998a --- /dev/null +++ b/vllm/entrypoints/renderer.py @@ -0,0 +1,219 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import asyncio +from abc import ABC, abstractmethod +from typing import Annotated, Optional, Union + +from pydantic import Field + +from vllm.config import ModelConfig +from vllm.inputs.data import TokensPrompt as EngineTokensPrompt +from vllm.inputs.parse import parse_and_batch_prompt +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils import AsyncMicrobatchTokenizer + + +class BaseRenderer(ABC): + """ + Base class for unified input processing and rendering. + + The Renderer serves as a unified input processor that consolidates + tokenization, chat template formatting, and multimodal input handling + into a single component. 
+ It converts high-level API requests (OpenAI-style JSON) into token IDs and + multimodal features ready for engine consumption. + + Key responsibilities: + - Convert text prompts to token sequences with proper special tokens + - Apply chat templates and format conversations + - Handle multimodal inputs (images, audio, etc.) when applicable + - Manage prompt truncation and length validation + - Provide clean separation between API layer and engine core + """ + + def __init__( + self, + model_config: ModelConfig, + tokenizer: Optional[AnyTokenizer] = None, + ): + super().__init__() + self.model_config = model_config + self.tokenizer = tokenizer + + @abstractmethod + async def render_prompt( + self, + prompt_or_prompts: Union[str, list[str], list[int], list[list[int]]], + max_length: Optional[int] = None, + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None, + add_special_tokens: Optional[bool] = True, + cache_salt: Optional[str] = None, + ) -> list[EngineTokensPrompt]: + """ + Convert input prompts into tokenized format for engine processing. + + This is the core method that transforms various input formats into + standardized TokensPrompt objects. Implementations should handle + tokenization, special token insertion, truncation, and validation + according to model requirements. 
+ + Args: + prompt_or_prompts: Input data in various formats: + - str: Single text prompt + - list[str]: Batch of text prompts + - list[int]: Pre-tokenized sequence + - list[list[int]]: Batch of pre-tokenized sequences + max_length: Maximum sequence length (endpoint-specific behavior) + truncate_prompt_tokens: Truncate to last N tokens + (None=no truncation, 0=empty) + add_special_tokens: Add model-specific tokens (e.g., [CLS], [SEP]) + to text inputs + cache_salt: Optional string to disambiguate cached prompts + + Returns: + list[EngineTokensPrompt]: Tokenized prompts ready for engine + consumption + + Raises: + ValueError: If input format is invalid or length limits exceeded + """ + raise NotImplementedError + + +class CompletionRenderer(BaseRenderer): + + def __init__( + self, + model_config: ModelConfig, + tokenizer: Optional[AnyTokenizer] = None, + async_tokenizer_pool: Optional[dict[AnyTokenizer, + AsyncMicrobatchTokenizer]] = None, + ): + super().__init__(model_config, tokenizer) + self.async_tokenizer_pool = async_tokenizer_pool or {} + self.async_tokenizer: Optional[AsyncMicrobatchTokenizer] = None + + async def render_prompt( + self, + prompt_or_prompts: Union[str, list[str], list[int], list[list[int]]], + max_length: Optional[int] = None, + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None, + add_special_tokens: Optional[bool] = True, + cache_salt: Optional[str] = None, + ) -> list[EngineTokensPrompt]: + """Implementation of prompt rendering for completion-style requests. + + Uses async tokenizer pooling for improved performance. See base class + for detailed parameter documentation. 
+ """ + if truncate_prompt_tokens is not None: + if max_length is not None: + assert 0 <= truncate_prompt_tokens <= max_length + if truncate_prompt_tokens == 0: + return [] + + # Parse and batch the input prompts + batch_inputs = parse_and_batch_prompt(prompt_or_prompts) + + rendered_prompts: list[EngineTokensPrompt] = [] + tokenize_tasks = [] + for prompt_input in batch_inputs: + if prompt_input["is_tokens"] is True: + # Token input + token_ids = self._maybe_apply_truncation( + prompt_input["content"], truncate_prompt_tokens) + rendered_prompts.append( + self._create_tokens_prompt(token_ids, max_length, + cache_salt)) + else: + # Text input + tokenize_task = asyncio.create_task( + self._tokenize(prompt_input["content"], max_length, + truncate_prompt_tokens, add_special_tokens, + cache_salt)) + tokenize_tasks.append(tokenize_task) + + # Wait for all text tokenization to finish + if tokenize_tasks: + tokenized_text_prompts = await asyncio.gather(*tokenize_tasks) + rendered_prompts.extend(tokenized_text_prompts) + + return rendered_prompts + + def _maybe_apply_truncation( + self, token_ids: list[int], + truncate_prompt_tokens: Optional[int]) -> list[int]: + """Apply truncation to token sequence.""" + if truncate_prompt_tokens is None: + return token_ids + if truncate_prompt_tokens >= len(token_ids): + return token_ids + + return token_ids[-truncate_prompt_tokens:] + + async def _tokenize( + self, + text: str, + max_length: Optional[int], + truncate_prompt_tokens: Optional[int], + add_special_tokens: Optional[bool], + cache_salt: Optional[str], + ) -> EngineTokensPrompt: + """Tokenize text input asynchronously.""" + async_tokenizer = self._get_async_tokenizer() + + # Handle encoder-specific preprocessing + if (self.model_config.encoder_config is not None + and self.model_config.encoder_config.get( + "do_lower_case", False)): + text = text.lower() + + # Tokenize texts + if truncate_prompt_tokens is None: + encoded = await async_tokenizer( + text, 
add_special_tokens=add_special_tokens) + else: + encoded = await async_tokenizer( + text, + add_special_tokens=add_special_tokens, + truncation=True, + max_length=truncate_prompt_tokens) + + return self._create_tokens_prompt(encoded.input_ids, max_length, + cache_salt) + + def _get_async_tokenizer(self) -> AsyncMicrobatchTokenizer: + """Get or create async tokenizer using shared pool.""" + if self.async_tokenizer is not None: + return self.async_tokenizer + if self.tokenizer is None: + raise ValueError( + "No tokenizer available for text input processing") + + # Check shared pool first + if self.tokenizer in self.async_tokenizer_pool: + return self.async_tokenizer_pool[self.tokenizer] + + # Create new async tokenizer and add to pool + self.async_tokenizer = AsyncMicrobatchTokenizer(self.tokenizer) + self.async_tokenizer_pool[self.tokenizer] = self.async_tokenizer + return self.async_tokenizer + + def _create_tokens_prompt( + self, + token_ids: list[int], + max_length: Optional[int] = None, + cache_salt: Optional[str] = None, + ) -> EngineTokensPrompt: + """Create validated EngineTokensPrompt.""" + if max_length is not None and len(token_ids) > max_length: + raise ValueError( + f"This maximum context length is {max_length} tokens. " + f"However, your request has {len(token_ids)} input tokens. 
" + "Please reduce the length of the input messages.") + + tokens_prompt = EngineTokensPrompt(prompt_token_ids=token_ids) + if cache_salt is not None: + tokens_prompt["cache_salt"] = cache_salt + return tokens_prompt From cb55ad86fe47759c22de06b3b3b93b3aa28690a2 Mon Sep 17 00:00:00 2001 From: Benji Beck Date: Wed, 3 Sep 2025 23:09:11 -0700 Subject: [PATCH 70/95] Migrate ultravox inputs to TensorSchema (#23503) Signed-off-by: Benji Beck --- vllm/model_executor/models/ultravox.py | 60 +++++++++++++------------- 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index f91c4ddb6e834..c883065805279 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -4,7 +4,7 @@ # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py """PyTorch Ultravox model.""" from collections.abc import Iterable, Mapping, Sequence -from typing import Any, Literal, Optional, TypedDict, Union +from typing import Annotated, Any, Literal, Optional, Union import torch from torch import nn @@ -31,6 +31,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.ultravox import UltravoxConfig +from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP) @@ -43,26 +44,37 @@ _AUDIO_PLACEHOLDER_OVERRIDE = "<|audio|>" _MAX_ENCODER_BATCH_SIZE = 16 -class UltravoxAudioFeatureInputs(TypedDict): +class UltravoxAudioFeatureInputs(TensorSchema): + """ + Dimensions: + - b: batch size + - n: number of chunks + - t: Time frames (M) + - nmb: Number of mel bins + """ type: Literal["audio_features"] - data: Union[torch.Tensor, list[torch.Tensor], 
list[list[torch.Tensor]]] - """Shape: `(batch_size, num_chunks, 80, M)`""" - lens: Union[torch.Tensor, list[torch.Tensor]] - """ - Length of the audio frames. Used for attention mask in WhisperEncoder. - Shape: `(batch_size, num_chunks)` - """ - token_len: Union[torch.Tensor, list[torch.Tensor]] - """ - Length of the audio tokens. Used for flattening the audio features. - Shape: `(batch_size, num_chunks)` - """ + data: Annotated[Union[torch.Tensor, list[torch.Tensor], + list[list[torch.Tensor]]], + TensorShape("b", "n", "nmb", "t", dynamic_dims={"n"})] + lens: Annotated[Union[torch.Tensor, list[torch.Tensor]], + TensorShape("b", "n", dynamic_dims={"n"})] + """Length of the audio frames. Used for attention mask in WhisperEncoder.""" + token_len: Annotated[Union[torch.Tensor, list[torch.Tensor]], + TensorShape("b", "n", dynamic_dims={"n"})] + """Length of the audio tokens. Used for flattening the audio features.""" -class UltravoxAudioEmbeddingInputs(TypedDict): +class UltravoxAudioEmbeddingInputs(TensorSchema): + """ + Dimensions: + - b: batch size + - na: number of audios + - afs: audio feature size + - hs: hidden size + """ type: Literal["audio_embeds"] - data: NestedTensors - """Shape: `(batch_size, num_audios, audio_feature_size, hidden_size)`""" + data: Annotated[Union[torch.Tensor, list[torch.Tensor]], + TensorShape("b", "na", "afs", "hs")] UltravoxAudioInputs = Union[UltravoxAudioFeatureInputs, @@ -484,26 +496,12 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): return None if audio_features is not None: - if not isinstance(audio_features, (torch.Tensor, list)): - raise ValueError("Incorrect type of audio features. " - f"Got type: {type(audio_features)}") - if not isinstance(audio_lens, (torch.Tensor, list)): - raise ValueError("Incorrect type of audio_lens. " - f"Got type: {type(audio_features)}") - if not isinstance(audio_token_len, (torch.Tensor, list)): - raise ValueError("Incorrect type of audio_token_len. 
" - f"Got type: {type(audio_features)}") - return UltravoxAudioFeatureInputs(type="audio_features", data=audio_features, lens=audio_lens, token_len=audio_token_len) if audio_embeds is not None: - if not isinstance(audio_embeds, (torch.Tensor, list)): - raise ValueError("Incorrect type of audio embeds. " - f"Got type: {type(audio_embeds)}") - return UltravoxAudioEmbeddingInputs(type="audio_embeds", data=audio_embeds) From 57b1ce94f7ac13624a32da90658b36b60a276a60 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Thu, 4 Sep 2025 14:28:45 +0800 Subject: [PATCH 71/95] [CPU] Refactor CPU unquantized linear (#24150) Signed-off-by: jiang1.li --- csrc/cpu/dnnl_helper.cpp | 177 ++++++++++++++++++ csrc/cpu/dnnl_helper.h | 74 ++++++++ csrc/cpu/dnnl_kernels.cpp | 54 ++++++ csrc/cpu/torch_bindings.cpp | 18 ++ tests/kernels/test_onednn.py | 70 +++++++ vllm/_custom_ops.py | 29 +++ vllm/model_executor/layers/linear.py | 25 +-- vllm/model_executor/layers/utils.py | 39 +++- .../layers/vocab_parallel_embedding.py | 6 + 9 files changed, 466 insertions(+), 26 deletions(-) diff --git a/csrc/cpu/dnnl_helper.cpp b/csrc/cpu/dnnl_helper.cpp index f3f00edb36068..6def0e061fa96 100644 --- a/csrc/cpu/dnnl_helper.cpp +++ b/csrc/cpu/dnnl_helper.cpp @@ -22,6 +22,23 @@ void release_dnnl_matmul_handler(int64_t handler) { delete ptr; } +DNNLScratchPadManager::DNNLScratchPadManager() : size_(0), ptr_(nullptr) { + this->realloc(allocation_unit * 128); +} + +void DNNLScratchPadManager::realloc(size_t new_size) { + new_size = round(new_size); + if (new_size > size_) { + ptr_ = std::aligned_alloc(64, new_size); + size_ = new_size; + } +} + +DNNLScratchPadManager* DNNLScratchPadManager::get_dnnl_scratchpad_manager() { + static DNNLScratchPadManager manager; + return &manager; +} + template class DNNLPrimitiveCache { public: @@ -166,6 +183,23 @@ struct hash { hash()(static_cast(val.bias_type)); } }; + +template <> +struct hash { + size_t operator()( + const MatMulPrimitiveHandler::ClassMatmulCacheKey& val) 
const { + return hash()(val.b_n_size) ^ hash()(val.b_k_size); + } +}; + +template <> +struct hash { + size_t operator()(const MatMulPrimitiveHandler::MSizeCacheKey& val) const { + return hash()(val.a_m_size) ^ + hash()(val.a_m_stride) ^ hash()(val.use_bias) ^ + hash()(static_cast(val.bias_type)); + } +}; } // namespace std bool operator==(const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& l, @@ -181,6 +215,17 @@ bool operator==(const W8A8MatMulPrimitiveHandler::MSizeCacheKey& l, l.bias_type == r.bias_type; } +bool operator==(const MatMulPrimitiveHandler::ClassMatmulCacheKey& l, + const MatMulPrimitiveHandler::ClassMatmulCacheKey& r) { + return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size; +} + +bool operator==(const MatMulPrimitiveHandler::MSizeCacheKey& l, + const MatMulPrimitiveHandler::MSizeCacheKey& r) { + return l.a_m_size == r.a_m_size && l.a_m_stride == r.a_m_stride && + l.use_bias == r.use_bias && l.bias_type == r.bias_type; +} + static std::shared_ptr get_w8a8_class_primitive_cache( const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& key, @@ -239,6 +284,11 @@ void W8A8MatMulPrimitiveHandler::execute(ExecArgs& args) { } dnnl::matmul matmul = get_matmul_cache(args); + + auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(5); + scratchpad_storage->set_data_handle( + DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data()); + matmul.execute(default_stream(), memory_cache_); default_stream().wait(); } @@ -257,6 +307,8 @@ dnnl::matmul W8A8MatMulPrimitiveHandler::get_matmul_cache( return m_size_cache_->get_or_create(key, [&]() { dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false); + auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager(); + manager->realloc(desc.scratchpad_desc().get_size()); return dnnl::matmul(desc); }); } @@ -300,6 +352,11 @@ void W8A8MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) { dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, 
{1}}, default_engine(), nullptr); set_runtime_memory_ptr(4, memory_cache_[DNNL_ARG_BIAS].get()); + + memory_cache_[DNNL_ARG_SCRATCHPAD] = + dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, + default_engine(), nullptr); + set_runtime_memory_ptr(5, memory_cache_[DNNL_ARG_SCRATCHPAD].get()); } dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc( @@ -319,6 +376,9 @@ dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc( dnnl::memory::format_tag::ab); dnnl::primitive_attr attr; + + attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + // For PER_TOKEN, scales will be applied in outside epilogue if (a_qs_ == QuantizationStrategy::PER_TENSOR) { attr.set_scales_mask(DNNL_ARG_SRC, 0); @@ -344,3 +404,120 @@ dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc( attr); } } + +MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args) + : DNNLMatMulPrimitiveHandler( + static_cast(args), args.ab_type), + m_size_cache_(nullptr) { + assert(ab_type_ == dnnl::memory::data_type::f32 || + ab_type_ == dnnl::memory::data_type::bf16 || + ab_type_ == dnnl::memory::data_type::f16); + prepack_weight(args.b_ptr, + create_primitive_desc( + MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL, + .a_m_stride = DNNL_RUNTIME_DIM_VAL, + .use_bias = false, + .bias_type = dnnl::memory::data_type::undef}, + true) + .weights_desc()); + init_runtime_memory_cache(args); +} + +static std::shared_ptr +get_matul_class_primitive_cache( + const MatMulPrimitiveHandler::ClassMatmulCacheKey& key, + int64_t cache_size) { + static MatMulPrimitiveHandler::ClassMatmulCache cache(128); + assert(cache_size > 0); + return cache.get_or_create(key, [&]() { + return std::make_shared(cache_size); + }); +} + +void MatMulPrimitiveHandler::execute(ExecArgs& args) { + auto&& [a_storage, a_mem_desc] = get_runtime_memory_ptr(0); + auto&& [c_storage, c_mem_desc] = get_runtime_memory_ptr(1); + 
a_storage->set_data_handle((void*)args.a_ptr); + a_mem_desc->dims[0] = args.a_m_size; + a_mem_desc->format_desc.blocking.strides[0] = args.a_m_stride; + c_storage->set_data_handle((void*)args.c_ptr); + c_mem_desc->dims[0] = args.a_m_size; + + if (args.use_bias) { + auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2); + bias_storage->set_data_handle((void*)args.bias_ptr); + } + + dnnl::matmul matmul = get_matmul_cache(args); + + auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3); + scratchpad_storage->set_data_handle( + DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data()); + + matmul.execute(default_stream(), memory_cache_); + default_stream().wait(); +} + +dnnl::matmul MatMulPrimitiveHandler::get_matmul_cache( + const MSizeCacheKey& key) { + if (m_size_cache_.get() == nullptr) { + ClassMatmulCacheKey key = {.b_n_size = b_n_size_, .b_k_size = b_k_size_}; + m_size_cache_ = get_matul_class_primitive_cache(key, primitive_cache_size_); + } + return m_size_cache_->get_or_create(key, [&]() { + dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false); + auto manager = DNNLScratchPadManager::get_dnnl_scratchpad_manager(); + manager->realloc(desc.scratchpad_desc().get_size()); + return dnnl::matmul(desc); + }); +} + +dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc( + const MSizeCacheKey& key, bool first_time) { + dnnl::memory::desc a_md; + dnnl::memory::desc b_md; + if (first_time) { + a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_, + dnnl::memory::format_tag::ab); + b_md = dnnl::memory::desc({b_k_size_, b_n_size_}, b_type_, + dnnl::memory::format_tag::any); + } else { + a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_, + {key.a_m_stride, 1}); + b_md = b_target_mem_desc_; + } + dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_, + dnnl::memory::format_tag::ab); + + dnnl::primitive_attr attr; + 
attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + + if (key.use_bias) { + dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1}); + return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md, + c_md, attr); + } else { + return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md, + attr); + } +} + +void MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) { + memory_cache_[DNNL_ARG_SRC] = dnnl::memory( + {{1, b_k_size_}, b_type_, {b_k_size_, 1}}, default_engine(), nullptr); + set_runtime_memory_ptr(0, memory_cache_[DNNL_ARG_SRC].get()); + memory_cache_[DNNL_ARG_DST] = + dnnl::memory({{1, b_n_size_}, c_type_, dnnl::memory::format_tag::ab}, + default_engine(), nullptr); + set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get()); + + memory_cache_[DNNL_ARG_BIAS] = + dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, + default_engine(), nullptr); + set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get()); + + memory_cache_[DNNL_ARG_SCRATCHPAD] = + dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, + default_engine(), nullptr); + set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get()); +} diff --git a/csrc/cpu/dnnl_helper.h b/csrc/cpu/dnnl_helper.h index 54ceefced9e98..ad6773d2b9fd6 100644 --- a/csrc/cpu/dnnl_helper.h +++ b/csrc/cpu/dnnl_helper.h @@ -59,6 +59,30 @@ constexpr inline dnnl::memory::data_type get_dnnl_type() { return DNNLType>::type; } +class DNNLScratchPadManager { + public: + static constexpr size_t allocation_unit = 4 * 1024 * 1024; // 4KB + + static DNNLScratchPadManager* get_dnnl_scratchpad_manager(); + + DNNLScratchPadManager(); + + template + T* get_data() { + return reinterpret_cast(ptr_); + } + + static size_t round(size_t size) { + return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit; + } + + void realloc(size_t new_size); + + private: + size_t size_; + void* ptr_; +}; + class DNNLMatMulPrimitiveHandler { public: 
virtual ~DNNLMatMulPrimitiveHandler() = default; @@ -166,4 +190,54 @@ class W8A8MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler { std::shared_ptr m_size_cache_; }; +class MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler { + public: + struct Args : public DNNLMatMulPrimitiveHandler::Args { + dnnl::memory::data_type ab_type; + }; + + struct ClassMatmulCacheKey { + dnnl_dim_t b_n_size; + dnnl_dim_t b_k_size; + + friend bool operator==(const ClassMatmulCacheKey& l, + const ClassMatmulCacheKey& r); + }; + + struct MSizeCacheKey { + dnnl_dim_t a_m_size; + dnnl_dim_t a_m_stride; + bool use_bias; + dnnl::memory::data_type bias_type; + + friend bool operator==(const MSizeCacheKey& l, const MSizeCacheKey& r); + }; + + using MSizeCache = DNNLPrimitiveCache; + using ClassMatmulCache = + DNNLPrimitiveCache>; + + struct ExecArgs : public MSizeCacheKey { + const void* a_ptr; + const void* bias_ptr; + void* c_ptr; + }; + + public: + MatMulPrimitiveHandler(const Args& args); + + void execute(ExecArgs& args); + + private: + dnnl::matmul::primitive_desc create_primitive_desc(const MSizeCacheKey& key, + bool first_time); + + void init_runtime_memory_cache(const Args& args); + + dnnl::matmul get_matmul_cache(const MSizeCacheKey& key); + + private: + std::shared_ptr m_size_cache_; +}; + #endif diff --git a/csrc/cpu/dnnl_kernels.cpp b/csrc/cpu/dnnl_kernels.cpp index acc3b9ecde143..1aa99614926df 100644 --- a/csrc/cpu/dnnl_kernels.cpp +++ b/csrc/cpu/dnnl_kernels.cpp @@ -379,6 +379,7 @@ void onednn_scaled_mm( exec_args.a_ptr = a.data_ptr(); exec_args.a_m_size = a.size(0); exec_args.bias_ptr = nullptr; + exec_args.bias_type = get_dnnl_type(); exec_args.use_bias = false; exec_args.a_scales_ptr = nullptr; exec_args.a_zero_points_ptr = nullptr; @@ -492,3 +493,56 @@ void dynamic_scaled_int8_quant( } }); } + +int64_t create_onednn_mm_handler(const torch::Tensor& b, + int64_t primitive_cache_size) { + TORCH_CHECK(b.dim() == 2); + + MatMulPrimitiveHandler::Args args; + 
args.primitive_cache_size = primitive_cache_size; + + args.b_k_size = b.size(0); + args.b_k_stride = b.stride(0); + args.b_n_size = b.size(1); + args.b_n_stride = b.stride(1); + args.b_ptr = b.data_ptr(); + + VLLM_DISPATCH_FLOATING_TYPES(b.scalar_type(), "create_onednn_mm_handler", + [&] { + args.c_type = get_dnnl_type(); + args.ab_type = get_dnnl_type(); + }); + + return reinterpret_cast(new MatMulPrimitiveHandler(args)); +} + +void onednn_mm(torch::Tensor& c, // [M, OC], row-major + const torch::Tensor& a, // [M, IC], row-major + const std::optional& bias, int64_t handler) { + CPU_KERNEL_GUARD_IN(onednn_mm) + TORCH_CHECK(a.dim() == 2); + TORCH_CHECK(a.stride(-1) == 1); + TORCH_CHECK(c.is_contiguous()); + MatMulPrimitiveHandler* ptr = + reinterpret_cast(handler); + + MatMulPrimitiveHandler::ExecArgs exec_args; + exec_args.a_m_size = a.size(0); + exec_args.a_m_stride = a.stride(0); + + VLLM_DISPATCH_FLOATING_TYPES(a.scalar_type(), "onednn_mm", [&] { + if (bias.has_value()) { + exec_args.use_bias = true; + exec_args.bias_type = get_dnnl_type(); + exec_args.bias_ptr = bias->data_ptr(); + } else { + exec_args.use_bias = false; + exec_args.bias_type = get_dnnl_type(); + exec_args.bias_ptr = nullptr; + } + exec_args.a_ptr = a.data_ptr(); + exec_args.c_ptr = c.data_ptr(); + + ptr->execute(exec_args); + }); +} diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index c9f426bdf618a..98c3ebc5a75f8 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -21,6 +21,12 @@ void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a, const std::optional& bias, int64_t handler); +int64_t create_onednn_mm_handler(const torch::Tensor& b, + int64_t primitive_cache_size); + +void onednn_mm(torch::Tensor& c, const torch::Tensor& a, + const std::optional& bias, int64_t handler); + void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query, torch::Tensor& kv_cache, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens); @@ 
-153,6 +159,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("release_dnnl_matmul_handler(int handler) -> ()", &release_dnnl_matmul_handler); + // Create oneDNN GEMM handler + ops.def( + "create_onednn_mm_handler(Tensor b, int " + "primitive_cache_size) -> int", + &create_onednn_mm_handler); + + // oneDNN GEMM + ops.def( + "onednn_mm(Tensor! c, Tensor a, Tensor? bias, " + "int handler) -> ()"); + ops.impl("onednn_mm", torch::kCPU, &onednn_mm); + // Create oneDNN W8A8 handler ops.def( "create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType " diff --git a/tests/kernels/test_onednn.py b/tests/kernels/test_onednn.py index 17692384ac9a9..37772464a209b 100644 --- a/tests/kernels/test_onednn.py +++ b/tests/kernels/test_onednn.py @@ -111,6 +111,49 @@ def onednn_int8_gemm_test_helper(primitive_cache_size: int, torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0) +def onednn_gemm_test_helper(primitive_cache_size: int, + m: int, + n: int, + k: int, + use_bias: bool, + use_stride: bool, + dtype: torch.dtype = torch.bfloat16, + device: str = "cpu"): + if use_stride: + a = torch.rand((m, 2 * k), dtype=dtype, device=device) * 1.5 + a = a[:, :k] + else: + a = torch.rand((m, k), dtype=dtype, device=device) * 1.5 + + b = torch.rand((n, k), dtype=dtype, device=device) * 1.5 + + if use_bias: + bias = torch.rand((n, ), device=device, dtype=dtype) * 5 + bias_f32 = bias.float() + else: + bias = None + bias_f32 = None + + handler = ops.create_onednn_mm( + b.t(), + primitive_cache_size, + ) + + out = ops.onednn_mm(handler, a, bias) + baseline = torch.nn.functional.linear(a.float(), b.float(), + bias_f32).to(dtype=a.dtype) + + torch.testing.assert_close(out, baseline) + + if use_bias: + # To test runtime bias setting + out = ops.onednn_mm(handler, a, None) + baseline = torch.nn.functional.linear(a.float(), b.float(), + None).to(dtype=a.dtype) + + torch.testing.assert_close(out, baseline) + + @pytest.mark.parametrize("n,k", NK_FACTORS) 
@pytest.mark.parametrize("m_list", M_FACTORS) @pytest.mark.parametrize("per_tensor_a_scale", [True, False]) @@ -142,3 +185,30 @@ def test_onednn_int8_scaled_gemm( use_azp=use_azp, out_dtype=output_type, ) + + +@pytest.mark.parametrize("n,k", NK_FACTORS) +@pytest.mark.parametrize("m_list", M_FACTORS) +@pytest.mark.parametrize("use_bias", [True, False]) +@pytest.mark.parametrize("use_stride", [True, False]) +@pytest.mark.parametrize("dtype", DTYPE) +@pytest.mark.parametrize("primitive_cache_size", CACHE_SIZES) +def test_onednn_gemm( + n: int, + k: int, + m_list: tuple[int], + use_bias: bool, + use_stride: bool, + dtype: torch.dtype, + primitive_cache_size: int, +): + for m in m_list: + onednn_gemm_test_helper( + primitive_cache_size=primitive_cache_size, + m=m, + n=n, + k=k, + use_bias=use_bias, + use_stride=use_stride, + dtype=dtype, + ) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 340d6e1164e4f..bb67d5790aaaa 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1928,6 +1928,35 @@ class CPUDNNLGEMMHandler: torch.ops._C.release_dnnl_matmul_handler(self.handler) +if hasattr(torch.ops._C, "create_onednn_mm_handler"): + _supports_onednn = True +else: + _supports_onednn = False + + +def create_onednn_mm( + weight: torch.Tensor, # [K, N] + primitive_cache_size: int = 128, +) -> CPUDNNLGEMMHandler: + handler = CPUDNNLGEMMHandler() + handler.k, handler.n = weight.size() + handler.handler = torch.ops._C.create_onednn_mm_handler( + weight, primitive_cache_size) + return handler + + +def onednn_mm( + dnnl_handler: CPUDNNLGEMMHandler, + x: torch.Tensor, + bias: Optional[torch.Tensor], +) -> torch.Tensor: + output = torch.empty((*x.shape[0:-1], dnnl_handler.n), dtype=x.dtype) + torch.ops._C.onednn_mm(output, x.reshape(-1, dnnl_handler.k), bias, + dnnl_handler.handler) + + return output + + def create_onednn_scaled_mm( weight: torch.Tensor, # [K, N] weight_scales: torch.Tensor, diff --git a/vllm/model_executor/layers/linear.py 
b/vllm/model_executor/layers/linear.py index f24c87dbf4509..1224b94d56e06 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -9,7 +9,6 @@ import torch import torch.nn as nn from torch.nn.parameter import Parameter, UninitializedParameter -from vllm import envs from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, @@ -200,26 +199,10 @@ class UnquantizedLinearMethod(LinearMethodBase): set_weight_attrs(weight, extra_weight_attrs) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - # special postprocessing for CPU SGL - if current_platform.is_cpu() and envs.VLLM_CPU_SGL_KERNEL: - from vllm.model_executor.layers.utils import check_cpu_sgl_kernel - N, K = layer.weight.size() - dtype = layer.weight.dtype - if check_cpu_sgl_kernel(N, K, dtype): - packed_weight = torch.ops._C.convert_weight_packed( - layer.weight) - assert packed_weight.size() == layer.weight.size() - layer.weight.copy_(packed_weight) - if layer.bias is not None: - layer.bias = Parameter(layer.bias.to(torch.float32), - requires_grad=False) - layer.use_cpu_sgl = True - else: - logger.warning( - "CPU SGL kernels require Intel AMX support," - " bf16/fp16/int8 weight, IC and OC are divisible by " - "32 and 16.") - layer.use_cpu_sgl = False + if current_platform.is_cpu(): + from vllm.model_executor.layers.utils import ( + dispatch_cpu_unquantized_gemm) + dispatch_cpu_unquantized_gemm(layer, remove_weight=True) def apply(self, layer: torch.nn.Module, diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index 2897f75b3129e..d2b135c1e4d4e 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -142,20 +142,49 @@ direct_register_custom_op( ) -def check_cpu_sgl_kernel(n: int, k: int, dtype: torch.dtype): +def check_cpu_sgl_kernel(n: int, k: int, dtype: torch.dtype) -> bool: return 
(torch._C._cpu._is_amx_tile_supported() and (dtype in (torch.bfloat16, torch.int8)) and k % 32 == 0 and n % 16 == 0) +def dispatch_cpu_unquantized_gemm( + layer: torch.nn.Module, + remove_weight: bool, +) -> None: + N, K = layer.weight.size() + dtype = layer.weight.dtype + if envs.VLLM_CPU_SGL_KERNEL and check_cpu_sgl_kernel(N, K, dtype): + packed_weight = torch.ops._C.convert_weight_packed(layer.weight) + if getattr(layer, "bias", None) is not None: + bias_f32 = layer.bias.to(torch.float32) + else: + bias_f32 = None + layer.cpu_linear = ( + lambda x, weight, bias: torch.ops._C.weight_packed_linear( + x, packed_weight, bias_f32 + if bias is not None else None, True)) + if remove_weight: + layer.weight = torch.nn.Parameter(torch.empty(0), + requires_grad=False) + elif ops._supports_onednn: + origin_weight = layer.weight + if remove_weight: + layer.weight = torch.nn.Parameter(torch.empty(0), + requires_grad=False) + handler = ops.create_onednn_mm(origin_weight.t(), 32) + layer.cpu_linear = lambda x, weight, bias: ops.onednn_mm( + handler, x, bias) + else: + layer.cpu_linear = lambda x, weight, bias: torch.nn.functional.linear( + x, weight, bias) + + def cpu_unquantized_gemm(layer: torch.nn.Module, x: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None): - if getattr(layer, "use_cpu_sgl", False): - return torch.ops._C.weight_packed_linear(x, weight, bias, True) - else: - return torch.nn.functional.linear(x, weight, bias) + return layer.cpu_linear(x, weight, bias) def dispatch_unquantized_gemm() -> Callable[..., torch.Tensor]: diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 9f223998e554f..c92a7978195bc 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -40,6 +40,12 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase): layer.register_parameter("weight", weight) set_weight_attrs(weight, 
extra_weight_attrs) + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + if current_platform.is_cpu(): + from vllm.model_executor.layers.utils import ( + dispatch_cpu_unquantized_gemm) + dispatch_cpu_unquantized_gemm(layer, remove_weight=False) + def apply(self, layer: torch.nn.Module, x: torch.Tensor, From 12e1e63cc5565c5926a2d3454d3b31a1ee03f564 Mon Sep 17 00:00:00 2001 From: Weida Hong Date: Thu, 4 Sep 2025 14:38:26 +0800 Subject: [PATCH 72/95] [Misc] Enhance output readability of helper script (#24214) Signed-off-by: Weida Hong --- benchmarks/auto_tune/auto_tune.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh index 82c20ffa6554c..d9d0fe4e0ccba 100644 --- a/benchmarks/auto_tune/auto_tune.sh +++ b/benchmarks/auto_tune/auto_tune.sh @@ -213,7 +213,7 @@ run_benchmark() { pkill -if vllm sleep 10 - printf '=%.0s' $(seq 1 20) + echo "====================" return 0 } From e7fc70016fed270940759dc753a726790701a0c1 Mon Sep 17 00:00:00 2001 From: bingchen-mi Date: Thu, 4 Sep 2025 15:08:09 +0800 Subject: [PATCH 73/95] [Model] Add MiDashengLM model support (#23652) Signed-off-by: chenbing8 Signed-off-by: bingchen-mi Co-authored-by: Jee Jee Li Co-authored-by: Isotr0py --- docs/models/supported_models.md | 1 + examples/offline_inference/audio_language.py | 31 + .../multimodal/processing/test_common.py | 1 + tests/models/registry.py | 2 + vllm/model_executor/models/midashenglm.py | 788 ++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + vllm/transformers_utils/config.py | 1 + vllm/transformers_utils/configs/__init__.py | 2 + .../transformers_utils/configs/midashenglm.py | 101 +++ 9 files changed, 928 insertions(+) create mode 100644 vllm/model_executor/models/midashenglm.py create mode 100644 vllm/transformers_utils/configs/midashenglm.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 7f54d98527686..c8f628d31abf5 
100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -643,6 +643,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + IE+ | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ | ✅︎ | | `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ | | `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I+ + V+ | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ | +| `MiDashengLMModel` | MiDashengLM | T + A+ | `mispeech/midashenglm-7b` | | ✅︎ | ✅︎ | | `MiniCPMO` | MiniCPM-O | T + IE+ + VE+ + AE+ | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ | | `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, `openbmb/MiniCPM-V-4_5`, etc. | ✅︎ | | ✅︎ | | `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + IE+ | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ | diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index 22cb8b057dac7..a5b8397e7e7ff 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -146,6 +146,36 @@ def run_granite_speech(question: str, audio_count: int) -> ModelRequestData: ) +# MiDashengLM +def run_midashenglm(question: str, audio_count: int): + model_name = "mispeech/midashenglm-7b" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + max_num_seqs=5, + limit_mm_per_prompt={"audio": audio_count}, + ) + + audio_in_prompt = "".join( + ["<|audio_bos|><|AUDIO|><|audio_eos|>" for idx in range(audio_count)] + ) + + default_system = "You are a helpful language and speech assistant." 
+ + prompt = ( + f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n" + f"{audio_in_prompt}{question}<|im_end|>\n" + "<|im_start|>assistant\n" + ) + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + ) + + # MiniCPM-O def run_minicpmo(question: str, audio_count: int) -> ModelRequestData: model_name = "openbmb/MiniCPM-o-2_6" @@ -352,6 +382,7 @@ model_example_map = { "voxtral": run_voxtral, "gemma3n": run_gemma3n, "granite_speech": run_granite_speech, + "midashenglm": run_midashenglm, "minicpmo": run_minicpmo, "phi4_mm": run_phi4mm, "phi4_multimodal": run_phi4_multimodal, diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 16c0428c6d8f1..8ffd65cf087be 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -302,6 +302,7 @@ def _test_processing_correctness_one( "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", "meta-llama/Llama-3.2-11B-Vision-Instruct", "TIGER-Lab/Mantis-8B-siglip-llama3", + "mispeech/midashenglm-7b", "openbmb/MiniCPM-Llama3-V-2_5", "openbmb/MiniCPM-o-2_6", "openbmb/MiniCPM-V-2_6", diff --git a/tests/models/registry.py b/tests/models/registry.py index f1f61c6151349..c9e2eec5117d4 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -460,6 +460,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { max_transformers_version="4.48", # noqa: E501 transformers_version_reason="HF model is not compatible.", # noqa: E501 hf_overrides={"architectures": ["MantisForConditionalGeneration"]}), # noqa: E501 + "MiDashengLMModel": _HfExamplesInfo("mispeech/midashenglm-7b", + trust_remote_code=True), "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6", trust_remote_code=True), "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5", diff --git a/vllm/model_executor/models/midashenglm.py b/vllm/model_executor/models/midashenglm.py new file mode 100644 index 0000000000000..858d4e7e34cf1 --- 
/dev/null +++ b/vllm/model_executor/models/midashenglm.py @@ -0,0 +1,788 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Copyright 2025 Horizon team, Xiaomi MiLM Plus. +# Copyright 2024 The Qwen team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only MiDashengLM model compatible with HuggingFace weights.""" +import collections +import collections.abc +from collections.abc import Iterable, Mapping, Sequence +from typing import Any, Callable, Optional, TypedDict, Union, cast + +import numpy as np +import torch +import torch.nn as nn +import torchaudio.transforms as audio_transforms +from transformers import BatchFeature + +from vllm.attention.layer import MultiHeadAttention +from vllm.config import VllmConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.model_loader.utils import set_default_torch_dtype +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalKwargsItems) +from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement, + PromptUpdate, PromptUpdateDetails) +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs.midashenglm import DashengConfig + +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP +from .utils import (AutoWeightsLoader, init_vllm_registered_model, + maybe_prefix, merge_multimodal_embeddings) + +_Tuple2 = Union[int, tuple[int, int], Sequence[int]] + + +def _resolve_tuple2(x: _Tuple2) -> tuple[int, int]: + if isinstance(x, collections.abc.Sequence): + assert len(x) == 2, ( + f"Expected a sequence of length 2, got {x} with length {len(x)}") + return cast(tuple[int, int], tuple(x)) + return (x, x) + 
+ +def calculate_mel_frames_dasheng( + audio_length_samples: int, + n_fft: int = 512, + hop_size: int = 160, + dasheng_subsampling: int = 4, + center=True, + model_subsampling: int = 5, +) -> int: + """Calculate the number of Mel-spectrogram frames.""" + if center: + audio_length_samples = audio_length_samples + n_fft + + return (int(1 + ((audio_length_samples - n_fft) / hop_size)) // + dasheng_subsampling // model_subsampling) + + +class AudioPatchEmbed(nn.Module): + + def __init__( + self, + input_size: _Tuple2 = 64, + patch_size: _Tuple2 = 16, + patch_stride: _Tuple2 = 16, + in_chans: int = 1, + embed_dim: int = 768, + norm_layer: Optional[Callable] = None, + flatten: bool = False, + ): + super().__init__() + self.input_size = _resolve_tuple2(input_size) + self.patch_size = _resolve_tuple2(patch_size) + self.patch_stride = _resolve_tuple2(patch_stride) + self.grid_size = ( + self.input_size[0] // self.patch_stride[0], + self.input_size[1] // self.patch_stride[1], + ) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + + self.proj = nn.Conv2d( + in_chans, + embed_dim, + kernel_size=self.patch_size, + stride=self.patch_stride, + ) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.proj(x) + if self.flatten: + x = torch.permute(torch.flatten( + x, 2, 3), (0, 2, 1)) # rearrange(x, "b c f t -> b (f t) c") + x = self.norm(x) + return x + + +class LayerScale(nn.Module): + + def __init__(self, dim, init_values=1e-5, inplace=False): + super().__init__() + self.inplace = inplace + self.gamma = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x.mul_(self.gamma) if self.inplace else x * self.gamma + + +class DashengMlp(nn.Module): + + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + quant_config: Optional[QuantizationConfig] 
= None, + prefix: str = "", + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = ColumnParallelLinear(input_size=in_features, + output_size=hidden_features, + quant_config=quant_config, + prefix=f"{prefix}.fc1") + self.act = get_act_fn("gelu") + self.fc2 = RowParallelLinear(input_size=hidden_features, + output_size=out_features, + quant_config=quant_config, + prefix=f"{prefix}.fc2") + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x, _ = self.fc1(x) + x = self.act(x) + x, _ = self.fc2(x) + return x + + +class DashengAttention(nn.Module): + + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + causal: bool = False, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + assert dim % num_heads == 0, "dim should be divisible by num_heads" + self.embed_dim = dim + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + if self.total_num_heads >= tp_size: + # Number of heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_heads % tp_size == 0 + else: + # Number of heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_heads == 0 + self.num_kv_heads = max(1, self.total_num_heads // tp_size) + self.head_dim = self.embed_dim // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scale = self.head_dim**-0.5 + + self.qkv = QKVParallelLinear( + hidden_size=self.embed_dim, + head_size=self.head_dim, + total_num_heads=self.total_num_heads, + total_num_kv_heads=self.total_num_heads, + bias=qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv", + ) + self.attn = MultiHeadAttention( + self.num_heads, + self.head_dim, + self.scale, + num_kv_heads=self.num_kv_heads, + ) + self.proj = RowParallelLinear( + input_size=dim, + output_size=dim, + quant_config=quant_config, + prefix=f"{prefix}.proj", + ) + self.causal = causal + + def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None): + B, N, C = x.shape + + qkv_out, _ = self.qkv(x) + q, k, v = qkv_out.split([self.q_size, self.kv_size, self.kv_size], + dim=-1) + + attn_out = self.attn(q, k, v) + C_local = attn_out.numel() // (B * N) # C_local for parallel + attn_out = attn_out.view(B, N, C_local) + + x, _ = self.proj(attn_out) + + return x + + +class DashengBlock(nn.Module): + + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = False, + init_values: Optional[float] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.norm1 = nn.LayerNorm(dim, eps=1e-6) + self.attn = DashengAttention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.attn", + ) + self.ls1 = (LayerScale(dim, init_values=init_values) + if init_values else nn.Identity()) + + self.norm2 = nn.LayerNorm(dim, eps=1e-6) + self.mlp = DashengMlp( + in_features=dim, + hidden_features=int(dim * mlp_ratio), + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + self.ls2 = (LayerScale(dim, 
init_values=init_values) + if init_values else nn.Identity()) + + # Kwargs usually has a mask parameter that is passed to Attention + def forward( + self, + x: torch.Tensor, + mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + x = x + self.ls1(self.attn(self.norm1(x), mask)) + x = x + self.ls2(self.mlp(self.norm2(x))) + return x + + +class DashengAudioTransformer(nn.Module): + + def __init__( + self, + config: DashengConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + + self.target_length = config.target_length + self.hop_length = config.hop_length + + self._init_front_end(config) + + self.init_bn = nn.BatchNorm2d(config.n_mels, momentum=0.01) + + self.patch_embed = AudioPatchEmbed( + input_size=(config.n_mels, config.target_length), + embed_dim=config.embed_dim, + in_chans=config.input_channels, + patch_size=config.patch_size, + flatten=False, + patch_stride=config.patch_stride, + ) + + self.time_pos_embed = nn.Parameter( + torch.empty(1, config.embed_dim, 1, self.patch_embed.grid_size[1])) + self.freq_pos_embed = nn.Parameter( + torch.empty(1, config.embed_dim, self.patch_embed.grid_size[0], 1)) + self.blocks = nn.ModuleList( + DashengBlock( + dim=config.embed_dim, + num_heads=config.num_heads, + mlp_ratio=config.mlp_ratio, + qkv_bias=config.qkv_bias, + init_values=config.init_values, + quant_config=quant_config, + prefix=f"{prefix}.block{i}", + ) for i in range(config.depth)) + self.norm = nn.LayerNorm(config.embed_dim, eps=1e-6) + + def _init_front_end(self, config): + with set_default_torch_dtype(torch.float32): + self.front_end = nn.Sequential( + audio_transforms.MelSpectrogram( + f_min=config.f_min, + f_max=config.f_max, + center=config.center, + win_length=config.win_length, + hop_length=config.hop_length, + sample_rate=config.sample_rate, + n_fft=config.n_fft, + n_mels=config.n_mels, + ), + audio_transforms.AmplitudeToDB(top_db=120), + ) + + mel_spectrogram = self.front_end[0] + fb = 
mel_spectrogram.mel_scale.fb + win = mel_spectrogram.spectrogram.window + mel_spectrogram.mel_scale.fb = fb.to(torch.bfloat16).to( + torch.float32) + mel_spectrogram.spectrogram.window = win.to(torch.bfloat16).to( + torch.float32) + + def forward_features( + self, + x: torch.Tensor, + mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + t = x.shape[-1] + x = x + self.time_pos_embed[:, :, :, :t] + x = (x + self.freq_pos_embed[:, :, :, :] + ) # Just to support __getitem__ in posembed + x = torch.permute(torch.flatten(x, 2, 3), + (0, 2, 1)) # rearrange(x, "b c f t -> b (f t) c") + for block in self.blocks: + x = block(x, mask) + x = self.norm(x) + return x + + def _to_mask(self, lengths: torch.Tensor, max_length: int) -> torch.Tensor: + batch_size = len(lengths) + idx = torch.arange(max_length, device=lengths.device) + idx = idx.repeat(batch_size).view(batch_size, max_length) + mask = (idx < lengths.unsqueeze(-1)).bool() + return mask + + def forward( + self, + x: torch.Tensor, + x_length: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + x = self.front_end(x) + x = x.to(self.time_pos_embed.dtype) + target_length_in_patches = self.target_length // 4 + x = x.unsqueeze(1) + x = torch.permute(x, (0, 2, 1, 3)) + x = self.init_bn(x) + x = torch.permute(x, (0, 2, 1, 3)) + + x = self.patch_embed(x) + t = x.shape[-1] + + input_splits = x.split(target_length_in_patches, dim=-1) + + if x_length is not None: + assert len(x_length) == len(x), ( + "batchsizes of input x and x_length need to be same") + assert x_length.ndim == 1, "Lengths are of size (B,)" + scaled_lengths = (x_length / (self.hop_length * 4)).long() + mask = self._to_mask(max_length=t, lengths=scaled_lengths) + split_masks = mask.logical_not().split(target_length_in_patches, + dim=-1) + else: + mask = None + split_masks = [None] * len(input_splits) + + outputs = [] + + for split_x, split_mask in zip(input_splits, split_masks): + forward_kwargs = {} + forward_kwargs["mask"] 
= split_mask + split_x = self.forward_features(split_x, **forward_kwargs) + outputs.append(split_x) + x = torch.cat(outputs, dim=1) + return x, mask + + +class AudioProjectorSubsample(nn.Module): + + def __init__( + self, + in_dim: int, + out_dim: int, + downsample_rate=5, + dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.k = downsample_rate + self.net = nn.Sequential( + ColumnParallelLinear( + input_size=in_dim * self.k, + output_size=out_dim, + quant_config=quant_config, + prefix=f"{prefix}.net.0", + return_bias=False, + ), get_act_fn("gelu"), + RowParallelLinear( + input_size=out_dim, + output_size=out_dim, + quant_config=quant_config, + prefix=f"{prefix}.net.2", + return_bias=False, + )) + + def forward(self, x, mask=None): + batch_size, seq_len, dim = x.shape + num_frames_to_discard = seq_len % self.k + if num_frames_to_discard > 0: + x = x[:, :-num_frames_to_discard, :] + if mask is not None: + mask = mask[:, :-num_frames_to_discard] + if mask is None: + mask = torch.ones(x.shape[:-1], dtype=torch.long, device=x.device) + x = x.reshape(batch_size, -1, self.k * + dim) # rearrange(x, "b (s k) d -> b s (k d)", k=self.k) + for layer in self.net: + x = layer(x) + mask = mask.reshape( + batch_size, -1, + self.k) # rearrange(mask, "b (s k) -> b s k", k=self.k) + mask = mask.any(dim=-1).long() + return x, mask + + +# === Audio Inputs === # +class MiDashengLMAudioInputs(TypedDict): + input_values: torch.Tensor + """Shape: `(num_audios, num_sampling_points)`""" + audio_length: torch.Tensor + """Shape: `(num_audios, 1)`""" + + +class MiDashengLMProcessingInfo(BaseProcessingInfo): + + def get_hf_config(self): + return self.ctx.get_hf_config() + + def get_feature_extractor(self): + hf_processor = self.get_hf_processor() + feature_extractor = hf_processor.feature_extractor + return feature_extractor + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return 
{"audio": None} + + def get_min_audio_len(self): + return 3200 + + def get_max_audio_len(self): + return 160000 + + +class MiDashengLMDummyInputsBuilder( + BaseDummyInputsBuilder[MiDashengLMProcessingInfo]): + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_audios = mm_counts.get("audio", 0) + + hf_processor = self.info.get_hf_processor() + audio_token = hf_processor.audio_token + + return audio_token * num_audios + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_audios = mm_counts.get("audio", 0) + + return { + "audio": + self._get_dummy_audios(length=self.info.get_max_audio_len(), + num_audios=num_audios) + } + + +class MiDashengLMMultiModalProcessor( + BaseMultiModalProcessor[MiDashengLMProcessingInfo]): + + def _get_data_parser(self) -> MultiModalDataParser: + feature_extractor = self.info.get_feature_extractor() + return MultiModalDataParser(target_sr=feature_extractor.sampling_rate) + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, Any], + tok_kwargs: Mapping[str, object], + ) -> BatchFeature: + audios = mm_data.pop("audios", []) + + # + Padding + min_audio_len = self.info.get_min_audio_len() + processed_audios = [ + np.pad(audio, (0, min_audio_len - audio.shape[-1]), + mode='constant', + constant_values=0) if isinstance(audio, np.ndarray) + and audio.shape[-1] < min_audio_len else audio for audio in audios + ] + + if processed_audios: + mm_data["audio"] = processed_audios + + if not mm_data.get("audio", []): + prompt_ids = self.info.get_tokenizer().encode(prompt) + prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") + + mm_kwargs = dict(**mm_kwargs, ) + + return super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + tok_kwargs=tok_kwargs, + ) + + def _get_mm_fields_config( + self, + hf_inputs: 
BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + input_values=MultiModalFieldConfig.batched("audio"), + audio_length=MultiModalFieldConfig.batched("audio"), + ) + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargsItems, + ) -> Sequence[PromptUpdate]: + processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() + + audio_token = getattr(processor, "audio_token", "<|AUDIO|>") + audio_bos_token = getattr(processor, "audio_bos_token", + "<|audio_bos|>") + audio_eos_token = getattr(processor, "audio_eos_token", + "<|audio_eos|>") + + audio_token_id = vocab[audio_token] + audio_bos_id = vocab[audio_bos_token] + audio_eos_id = vocab[audio_eos_token] + + out_mm_data = out_mm_kwargs.get_data() + audio_length = out_mm_data.get("audio_length") + if audio_length is None: + audio_output_lengths = [] + else: + audio_length_np = audio_length.cpu().numpy() if isinstance( + audio_length, torch.Tensor) else audio_length + audio_output_lengths = [ + max(1, calculate_mel_frames_dasheng( + int(length))) # at least one frame + for length in audio_length_np + ] + + def get_replacement_midashenglm(item_idx: int): + num_features = audio_output_lengths[item_idx] + audio_tokens = [audio_token_id] * num_features + + return PromptUpdateDetails.select_token_id( + [audio_bos_id] + audio_tokens + [audio_eos_id], + embed_token_id=audio_token_id, + ) + + return [ + PromptReplacement( + modality="audio", + target=audio_token, + replacement=get_replacement_midashenglm, + ) + ] + + +@MULTIMODAL_REGISTRY.register_processor( + MiDashengLMMultiModalProcessor, + info=MiDashengLMProcessingInfo, + dummy_inputs=MiDashengLMDummyInputsBuilder, +) +class MiDashengLMModel(nn.Module, SupportsMultiModal, SupportsPP): + + @classmethod + def get_placeholder_str(cls, 
modality: str, i: int) -> Optional[str]: + if modality.startswith("audio"): + return "<|audio_bos|><|AUDIO|><|audio_eos|>" + + raise ValueError("Only audio modality is supported") + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + + # Initialize audio components + self.audio_encoder = DashengAudioTransformer( + config.audio_encoder_config, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "audio_encoder"), + ) + self.audio_projector = AudioProjectorSubsample( + in_dim=config.audio_encoder_config.embed_dim, + out_dim=config.text_config.hidden_size, + downsample_rate=config.subsample_factor, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "audio_projector"), + ) + + # Initialize language model (decoder) + self.decoder = init_vllm_registered_model( + vllm_config=vllm_config, + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "decoder"), + architectures=["Qwen2ForCausalLM"], + ) + + self.quant_config = quant_config + self.make_empty_intermediate_tensors = ( + self.decoder.make_empty_intermediate_tensors) + + def _validate_and_reshape_mm_tensor(self, mm_input: object, + name: str) -> torch.Tensor: + if not isinstance(mm_input, (torch.Tensor, list)): + raise ValueError(f"Incorrect type of {name}. 
" + f"Got type: {type(mm_input)}") + if isinstance(mm_input, torch.Tensor): + return torch.concat(list(mm_input)) + else: + return torch.concat(mm_input) + + def _parse_and_validate_audio_input( + self, **kwargs: object) -> Optional[MiDashengLMAudioInputs]: + input_values = kwargs.pop("input_values", None) + audio_length = kwargs.pop("audio_length", None) + + if input_values is None: + return None + input_values = self._validate_and_reshape_mm_tensor( + input_values, "input_values") + audio_length = self._validate_and_reshape_mm_tensor( + audio_length, "audio_length") + if not isinstance(input_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of audio input features. " + f"Got type: {type(input_values)}") + + return MiDashengLMAudioInputs( + input_values=input_values, + audio_length=audio_length, + ) + + def _process_audio_input( + self, audio_input: MiDashengLMAudioInputs) -> torch.Tensor: + # Process audio through encoder and projector + input_values = audio_input["input_values"] + audio_length = audio_input["audio_length"] + + encoder_out, encoder_atts = self.audio_encoder(input_values, + audio_length) + audio_embeddings, _ = self.audio_projector(encoder_out, encoder_atts) + audio_embeddings = audio_embeddings.to( + audio_input["input_values"].dtype) + batch_size, max_audio_tokens, embed_dim = audio_embeddings.shape + + audio_length_np = audio_length.cpu().numpy() if isinstance( + audio_length, torch.Tensor) else audio_length + audio_output_lengths = [ + max(1, calculate_mel_frames_dasheng( + int(length))) # at least one frame + for length in audio_length_np + ] + audio_output_lengths = torch.tensor(audio_output_lengths).to( + audio_embeddings.device) + + audio_feature_mask = (torch.arange( + max_audio_tokens, + device=audio_embeddings.device).unsqueeze(0).expand( + batch_size, max_audio_tokens) + < audio_output_lengths.unsqueeze(1)) + + masked_audio_features = audio_embeddings[audio_feature_mask].view( + -1, embed_dim) + + return 
torch.split(masked_audio_features, + audio_output_lengths.tolist()) + + def get_language_model(self) -> torch.nn.Module: + return self.decoder + + def get_multimodal_embeddings(self, + **kwargs: object) -> MultiModalEmbeddings: + audio_input = self._parse_and_validate_audio_input(**kwargs) + + if audio_input is None: + return [] + return self._process_audio_input(audio_input) + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + inputs_embeds = self.decoder.get_input_embeddings(input_ids) + if multimodal_embeddings and len(multimodal_embeddings) > 0: + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + multimodal_embeddings, + self.config.audio_token_id, + ) + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + if intermediate_tensors is not None: + inputs_embeds = None + elif inputs_embeds is None: + multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + multimodal_embeddings) + input_ids = None + + return self.decoder.model(input_ids, + positions, + intermediate_tensors, + inputs_embeds=inputs_embeds) + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + return self.decoder.compute_logits(hidden_states, sampling_metadata) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index f236040bb2341..feca60f2f001e 100644 --- a/vllm/model_executor/models/registry.py +++ 
b/vllm/model_executor/models/registry.py @@ -236,6 +236,7 @@ _MULTIMODAL_MODELS = { "LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501 "LlavaOnevisionForConditionalGeneration": ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), # noqa: E501 "MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"), # noqa: E501 + "MiDashengLMModel": ("midashenglm", "MiDashengLMModel"), "MiniMaxVL01ForConditionalGeneration": ("minimax_vl_01", "MiniMaxVL01ForConditionalGeneration"), # noqa: E501 "MiniCPMO": ("minicpmo", "MiniCPMO"), "MiniCPMV": ("minicpmv", "MiniCPMV"), diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index bec792465bfbb..95e4ed1ccf07f 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -71,6 +71,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict( jais="JAISConfig", mlp_speculator="MLPSpeculatorConfig", medusa="MedusaConfig", + midashenglm="MiDashengLMConfig", eagle="EAGLEConfig", speculators="SpeculatorsConfig", nemotron="NemotronConfig", diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 8339c55bcf808..f651ecb078b95 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -17,6 +17,7 @@ from vllm.transformers_utils.configs.falcon import RWConfig from vllm.transformers_utils.configs.jais import JAISConfig from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig from vllm.transformers_utils.configs.medusa import MedusaConfig +from vllm.transformers_utils.configs.midashenglm import MiDashengLMConfig from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig from vllm.transformers_utils.configs.moonvit import MoonViTConfig from vllm.transformers_utils.configs.nemotron import NemotronConfig @@ -36,6 +37,7 @@ __all__ = [ "RWConfig", 
"JAISConfig", "MedusaConfig", + "MiDashengLMConfig", "MLPSpeculatorConfig", "MoonViTConfig", "KimiVLConfig", diff --git a/vllm/transformers_utils/configs/midashenglm.py b/vllm/transformers_utils/configs/midashenglm.py new file mode 100644 index 0000000000000..1c23202e23c8e --- /dev/null +++ b/vllm/transformers_utils/configs/midashenglm.py @@ -0,0 +1,101 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Copyright 2025 Horizon team, Xiaomi MiLM Plus. +# Copyright 2024 The Qwen team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import Optional, Union + +from transformers import PretrainedConfig +from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import ( + Qwen2_5OmniTextConfig) + + +class DashengConfig(PretrainedConfig): + model_type = "midashenglm_dasheng_encoder" + + def __init__( + self, + embed_dim: int = 768, + outputdim: int = 527, + patch_size: Union[int, tuple[int, int]] = 16, + patch_stride: Union[int, tuple[int, int]] = 16, + input_channels: int = 1, + target_length: int = 1012, + depth: int = 12, + num_heads: int = 12, + mlp_ratio: float = 4.0, + qkv_bias: bool = True, + init_values: Optional[float] = None, + drop_rate: float = 0.0, + attn_drop_rate: float = 0.0, + f_min: float = 0.0, + f_max: float = 8000.0, + center: bool = True, + win_length: int = 512, + hop_length: int = 160, + sample_rate: int = 16000, + n_fft: int = 512, + n_mels: int = 64, + **kwargs, + ): + self.embed_dim = embed_dim + self.outputdim = outputdim + self.patch_size = patch_size + self.patch_stride = patch_stride + self.input_channels = input_channels + self.target_length = target_length + self.depth = depth + self.num_heads = num_heads + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.init_values = init_values + self.drop_rate = drop_rate + self.attn_drop_rate = attn_drop_rate + self.f_min = f_min + self.f_max = f_max + self.center = center + self.win_length = win_length + self.hop_length = hop_length + self.sample_rate = sample_rate + self.n_fft = n_fft + self.n_mels = n_mels + super().__init__(**kwargs) + + +class MiDashengLMConfig(PretrainedConfig): + model_type = "midashenglm" + + def __init__( + self, + audio_encoder_config: Optional[dict] = None, + subsample_factor: int = 5, + text_config: Optional[dict] = None, + audio_token_id: Optional[int] = None, + **kwargs, + ): + self.audio_encoder_config = DashengConfig( + **(audio_encoder_config or {})) + self.subsample_factor = subsample_factor + self.text_config = (Qwen2_5OmniTextConfig( + **text_config) if 
text_config else Qwen2_5OmniTextConfig()) + self.text_config.rope_scaling = None # uses_mrope is false + self.audio_token_id = audio_token_id + super().__init__(**kwargs) From 51d5e9be7dbf4d914374447548dd01f9bfb68f89 Mon Sep 17 00:00:00 2001 From: mgazz Date: Thu, 4 Sep 2025 08:22:41 +0100 Subject: [PATCH 74/95] [Core][Model] Terratorch backend integration (#23513) Signed-off-by: Michele Gazzetti Signed-off-by: Christian Pinto Co-authored-by: Christian Pinto Co-authored-by: Cyrus Leung --- .../prithvi_geospatial_mae.py | 6 +- .../prithvi_geospatial_mae_io_processor.py | 1 + .../online_serving/prithvi_geospatial_mae.py | 1 + requirements/test.in | 2 +- requirements/test.txt | 2 +- tests/distributed/test_pipeline_parallel.py | 6 + tests/distributed/test_sequence_parallel.py | 3 + .../entrypoints/openai/test_chat_template.py | 4 +- .../entrypoints/openai/test_skip_tokenizer.py | 6 +- tests/entrypoints/test_chat_utils.py | 12 +- .../multimodal/generation/vlm_utils/core.py | 3 + .../multimodal/pooling/test_prithvi_mae.py | 2 +- .../multimodal/processing/test_common.py | 4 +- .../processing/test_tensor_schema.py | 4 +- tests/models/multimodal/test_mapping.py | 4 +- tests/models/registry.py | 40 ++- tests/models/test_initialization.py | 5 +- tests/models/test_terratorch.py | 45 ++++ tests/models/utils.py | 2 + .../test_io_processor_plugins.py | 103 ++++---- vllm/config/__init__.py | 5 +- vllm/model_executor/models/registry.py | 15 +- ...rithvi_geospatial_mae.py => terratorch.py} | 238 ++++++++---------- 23 files changed, 305 insertions(+), 208 deletions(-) create mode 100644 tests/models/test_terratorch.py rename vllm/model_executor/models/{prithvi_geospatial_mae.py => terratorch.py} (52%) diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py index b6007b9f46301..1a5879a6d35f5 100644 --- a/examples/offline_inference/prithvi_geospatial_mae.py +++ b/examples/offline_inference/prithvi_geospatial_mae.py @@ 
-45,7 +45,11 @@ datamodule_config = { class PrithviMAE: def __init__(self, model): self.model = LLM( - model=model, skip_tokenizer_init=True, dtype="float16", enforce_eager=True + model=model, + skip_tokenizer_init=True, + dtype="float16", + enforce_eager=True, + model_impl="terratorch", ) def run(self, input_data, location_coords): diff --git a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py b/examples/offline_inference/prithvi_geospatial_mae_io_processor.py index adc27859a1cdd..5d629fabf0a27 100644 --- a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py +++ b/examples/offline_inference/prithvi_geospatial_mae_io_processor.py @@ -37,6 +37,7 @@ def main(): # The maximum number depends on the available GPU memory max_num_seqs=32, io_processor_plugin="prithvi_to_tiff_india", + model_impl="terratorch", ) pooling_params = PoolingParams(task="encode", softmax=False) diff --git a/examples/online_serving/prithvi_geospatial_mae.py b/examples/online_serving/prithvi_geospatial_mae.py index 359162c470f08..c6eed64838ea4 100644 --- a/examples/online_serving/prithvi_geospatial_mae.py +++ b/examples/online_serving/prithvi_geospatial_mae.py @@ -15,6 +15,7 @@ import requests # https://github.com/christian-pinto/prithvi_io_processor_plugin # - start vllm in serving mode with the below args # --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM' +# --model-impl terratorch # --task embed --trust-remote-code # --skip-tokenizer-init --enforce-eager # --io-processor-plugin prithvi_to_tiff_india diff --git a/requirements/test.in b/requirements/test.in index 5b1688c76c954..5db9cd797904f 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -53,5 +53,5 @@ runai-model-streamer==0.11.0 runai-model-streamer-s3==0.11.0 fastsafetensors>=0.1.10 pydantic>=2.10 # 2.9 leads to error on python 3.10 -terratorch==1.1rc2 # required for PrithviMAE test decord==0.6.0 +terratorch==1.1rc3 # required for PrithviMAE test diff --git a/requirements/test.txt 
b/requirements/test.txt index 0b728ebfb0071..332a9b9cfbf59 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1042,7 +1042,7 @@ tensorboardx==2.6.4 # via lightning tensorizer==2.10.1 # via -r requirements/test.in -terratorch==1.1rc2 +terratorch==1.1rc3 # via -r requirements/test.in threadpoolctl==3.5.0 # via scikit-learn diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 1afe9ea970c97..fffab1a984c26 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -298,6 +298,8 @@ def _compare_tp( tokenizer_mode = model_info.tokenizer_mode hf_overrides = model_info.hf_overrides hf_config = get_config(model_id, trust_remote_code) + skip_tokenizer_init = model_info.skip_tokenizer_init + max_num_seqs = model_info.max_num_seqs dtype = "float16" if hf_config.model_type in _FLOAT16_NOT_SUPPORTED_MODELS: @@ -351,6 +353,10 @@ def _compare_tp( common_args.extend(["--load-format", load_format]) if hf_overrides: common_args.extend(["--hf-overrides", json.dumps(hf_overrides)]) + if skip_tokenizer_init: + common_args.append("--skip-tokenizer-init") + if max_num_seqs: + common_args.extend(["--max-num-seqs", f"{max_num_seqs}"]) specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill testing_ray_compiled_graph = False diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index c93b436f384b9..65c5e68968440 100644 --- a/tests/distributed/test_sequence_parallel.py +++ b/tests/distributed/test_sequence_parallel.py @@ -178,6 +178,7 @@ def _compare_sp( trust_remote_code = model_info.trust_remote_code tokenizer_mode = model_info.tokenizer_mode hf_overrides = model_info.hf_overrides + skip_tokenizer_init = model_info.skip_tokenizer_init if load_format == "dummy": # Avoid OOM @@ -227,6 +228,8 @@ def _compare_sp( common_args.extend(["--load-format", load_format]) if hf_overrides: common_args.extend(["--hf-overrides", 
json.dumps(hf_overrides)]) + if skip_tokenizer_init: + common_args.append("--skip-tokenizer-init") compilation_config = { 'level': 3, diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py index 5b6e2a4146b1f..ce90a67c01517 100644 --- a/tests/entrypoints/openai/test_chat_template.py +++ b/tests/entrypoints/openai/test_chat_template.py @@ -104,7 +104,9 @@ def test_get_gen_prompt(model, template, add_generation_prompt, trust_remote_code=model_info.trust_remote_code, revision=model_info.revision, hf_overrides=model_info.hf_overrides, - ) + skip_tokenizer_init=model_info.skip_tokenizer_init, + enforce_eager=model_info.enforce_eager, + dtype=model_info.dtype) # Initialize the tokenizer tokenizer = get_tokenizer( diff --git a/tests/entrypoints/openai/test_skip_tokenizer.py b/tests/entrypoints/openai/test_skip_tokenizer.py index 0bb42ed8aa7fb..af520ac61d8df 100644 --- a/tests/entrypoints/openai/test_skip_tokenizer.py +++ b/tests/entrypoints/openai/test_skip_tokenizer.py @@ -11,7 +11,7 @@ import torch from ...utils import RemoteOpenAIServer -MODEL_NAME = "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM" +MODEL_NAME = "mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11" DTYPE = "float16" @@ -35,7 +35,9 @@ def server(): "--trust-remote-code", "--skip-tokenizer-init", "--max-num-seqs", - "32" + "32", + "--model-impl", + "terratorch" ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 0c1f19371a160..18db1027c004d 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -1266,7 +1266,9 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): revision=model_info.revision, trust_remote_code=model_info.trust_remote_code, hf_overrides=model_info.hf_overrides, - ) + skip_tokenizer_init=model_info.skip_tokenizer_init, + enforce_eager=model_info.enforce_eager, + 
dtype=model_info.dtype) # Build the tokenizer group and grab the underlying tokenizer tokenizer_group = TokenizerGroup( @@ -1322,7 +1324,9 @@ def test_resolve_content_format_hf_defined(model, expected_format): revision=model_info.revision, trust_remote_code=model_info.trust_remote_code, hf_overrides=model_info.hf_overrides, - ) + skip_tokenizer_init=model_info.skip_tokenizer_init, + enforce_eager=model_info.enforce_eager, + dtype=model_info.dtype) tokenizer_group = TokenizerGroup( model, @@ -1382,7 +1386,9 @@ def test_resolve_content_format_fallbacks(model, expected_format): revision=model_info.revision, trust_remote_code=model_info.trust_remote_code, hf_overrides=model_info.hf_overrides, - ) + skip_tokenizer_init=model_info.skip_tokenizer_init, + enforce_eager=model_info.enforce_eager, + dtype=model_info.dtype) tokenizer_group = TokenizerGroup( model_config.tokenizer, diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py index a5d6948f06efd..ae70838336957 100644 --- a/tests/models/multimodal/generation/vlm_utils/core.py +++ b/tests/models/multimodal/generation/vlm_utils/core.py @@ -69,6 +69,9 @@ def run_test( vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode if model_info.hf_overrides: vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides + if model_info.skip_tokenizer_init: + vllm_runner_kwargs_[ + "skip_tokenizer_init"] = model_info.skip_tokenizer_init if vllm_runner_kwargs: vllm_runner_kwargs_.update(vllm_runner_kwargs) diff --git a/tests/models/multimodal/pooling/test_prithvi_mae.py b/tests/models/multimodal/pooling/test_prithvi_mae.py index e9be79fba911f..b503d42567022 100644 --- a/tests/models/multimodal/pooling/test_prithvi_mae.py +++ b/tests/models/multimodal/pooling/test_prithvi_mae.py @@ -46,7 +46,7 @@ def _run_test( vllm_model.encode(prompt) -MODELS = ["christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"] +MODELS = ["mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11"] 
@pytest.mark.core_model diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 8ffd65cf087be..ced0ab3377a9e 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -66,7 +66,9 @@ def _test_processing_correctness( hf_overrides=model_info.hf_overrides, # Ensure that the cache can fit all of the data mm_processor_cache_gb=2048, - ) + skip_tokenizer_init=model_info.skip_tokenizer_init, + enforce_eager=model_info.enforce_eager, + dtype=model_info.dtype) model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) factories = MULTIMODAL_REGISTRY._processor_factories[model_cls] diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index 615564f70ea31..b678313752d65 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -196,7 +196,9 @@ def test_model_tensor_schema(model_arch: str, model_id: str): revision=model_info.revision, trust_remote_code=model_info.trust_remote_code, hf_overrides=hf_overrides_fn, - ) + skip_tokenizer_init=model_info.skip_tokenizer_init, + enforce_eager=model_info.enforce_eager, + dtype=model_info.dtype) model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) factories = MULTIMODAL_REGISTRY._processor_factories[model_cls] diff --git a/tests/models/multimodal/test_mapping.py b/tests/models/multimodal/test_mapping.py index 7096810d8e15c..caf1966ab513f 100644 --- a/tests/models/multimodal/test_mapping.py +++ b/tests/models/multimodal/test_mapping.py @@ -59,7 +59,9 @@ def test_hf_model_weights_mapper(model_arch: str): revision=model_info.revision, trust_remote_code=model_info.trust_remote_code, hf_overrides=model_info.hf_overrides, - ) + skip_tokenizer_init=model_info.skip_tokenizer_init, + enforce_eager=model_info.enforce_eager, + dtype=model_info.dtype) model_cls = 
MULTIMODAL_REGISTRY._get_model_cls(model_config) original_weights = create_repo_dummy_weights(model_id) diff --git a/tests/models/registry.py b/tests/models/registry.py index c9e2eec5117d4..38efb01341ebe 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -6,10 +6,11 @@ from dataclasses import dataclass, field from typing import Any, Literal, Optional import pytest +import torch from packaging.version import Version from transformers import __version__ as TRANSFORMERS_VERSION -from vllm.config import TokenizerMode +from vllm.config import ModelDType, TokenizerMode @dataclass(frozen=True) @@ -47,6 +48,23 @@ class _HfExamplesInfo: The reason for the minimum/maximum version requirement. """ + skip_tokenizer_init: bool = False + """ + If true, skip initialization of tokenizer and detokenizer. + """ + + dtype: ModelDType = "auto" + """ + The data type for the model weights and activations. + """ + + enforce_eager: bool = False + """ + Whether to enforce eager execution. If True, we will + disable CUDA graph and always execute the model in eager mode. + If False, we will use CUDA graph and eager execution in hybrid. + """ + is_available_online: bool = True """ Set this to ``False`` if the name of this architecture no longer exists on @@ -76,6 +94,9 @@ class _HfExamplesInfo: If not specified, the default revision will be used. 
""" + max_num_seqs: Optional[int] = None + """Maximum number of sequences to be processed in a single iteration.""" + def check_transformers_version( self, *, @@ -361,8 +382,21 @@ _EMBEDDING_EXAMPLE_MODELS = { "Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full", trust_remote_code=True), "Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1"), # noqa: E501 - "PrithviGeoSpatialMAE": _HfExamplesInfo("ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", # noqa: E501 - is_available_online=False), # noqa: E501 + "PrithviGeoSpatialMAE": _HfExamplesInfo("mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11", # noqa: E501 + dtype=torch.float16, + enforce_eager=True, + skip_tokenizer_init=True, + # This is to avoid the model + # going OOM in CI + max_num_seqs=32, + ), + "Terratorch": _HfExamplesInfo("mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11", + dtype=torch.float16, + enforce_eager=True, + skip_tokenizer_init=True, + # This is to avoid the model going OOM in CI + max_num_seqs=32, + ), } _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = { diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index b4d516233b4bf..aaa04f52f7794 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -73,6 +73,9 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, tokenizer=model_info.tokenizer, tokenizer_mode=model_info.tokenizer_mode, revision=model_info.revision, + enforce_eager=model_info.enforce_eager, + skip_tokenizer_init=model_info.skip_tokenizer_init, + dtype=model_info.dtype, speculative_config={ "model": model_info.speculative_model, "num_speculative_tokens": 1, @@ -85,7 +88,7 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, model_impl=ModelImpl.TRANSFORMERS if model_arch in _TRANSFORMERS_BACKEND_MODELS else ModelImpl.VLLM, hf_overrides=hf_overrides_fn, - ) + max_num_seqs=model_info.max_num_seqs) @pytest.mark.parametrize("model_arch", 
HF_EXAMPLE_MODELS.get_supported_archs()) diff --git a/tests/models/test_terratorch.py b/tests/models/test_terratorch.py new file mode 100644 index 0000000000000..bfa54280dc02d --- /dev/null +++ b/tests/models/test_terratorch.py @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +import torch + +from tests.conftest import VllmRunner +from vllm.utils import set_default_torch_num_threads + + +@pytest.mark.parametrize( + "model", + [ + "mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11", + "mgazz/Prithvi_v2_eo_300_tl_unet_agb" + ], +) +def test_inference( + vllm_runner: type[VllmRunner], + model: str, +) -> None: + + pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16) + location_coords = torch.full((1, 2), 1.0, dtype=torch.float16) + prompt = dict(prompt_token_ids=[1], + multi_modal_data=dict(pixel_values=pixel_values, + location_coords=location_coords)) + with ( + set_default_torch_num_threads(1), + vllm_runner( + model, + runner="pooling", + dtype=torch.float16, + enforce_eager=True, + skip_tokenizer_init=True, + # Limit the maximum number of sequences to avoid the + # test going OOM during the warmup run + max_num_seqs=32, + ) as vllm_model, + ): + + vllm_output = vllm_model.llm.encode(prompt) + assert torch.equal( + torch.isnan(vllm_output[0].outputs.data).any(), + torch.tensor(False)) diff --git a/tests/models/utils.py b/tests/models/utils.py index 40a41afff8287..ab0b27af4d697 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -294,6 +294,8 @@ def build_model_context( limit_mm_per_prompt=limit_mm_per_prompt, mm_processor_cache_gb=mm_processor_cache_gb, hf_overrides=model_info.hf_overrides, + skip_tokenizer_init=model_info.skip_tokenizer_init, + enforce_eager=model_info.enforce_eager, **model_config_kwargs, ) return InputContext(model_config) diff --git a/tests/plugins_tests/test_io_processor_plugins.py b/tests/plugins_tests/test_io_processor_plugins.py 
index b2fbef2ee25cb..825165e89b33c 100644 --- a/tests/plugins_tests/test_io_processor_plugins.py +++ b/tests/plugins_tests/test_io_processor_plugins.py @@ -7,12 +7,11 @@ import requests from tests.utils import RemoteOpenAIServer from vllm.config import VllmConfig -from vllm.entrypoints.llm import LLM from vllm.entrypoints.openai.protocol import IOProcessorResponse from vllm.plugins.io_processors import get_io_processor from vllm.pooling_params import PoolingParams -MODEL_NAME = "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM" +MODEL_NAME = "mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11" image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/valencia_example_2024-10-26.tiff" # noqa: E501 @@ -23,61 +22,7 @@ def test_loading_missing_plugin(): get_io_processor(vllm_config, "wrong_plugin") -def test_loading_engine_with_wrong_plugin(): - - with pytest.raises(ValueError): - LLM( - model=MODEL_NAME, - skip_tokenizer_init=True, - trust_remote_code=True, - enforce_eager=True, - # Limit the maximum number of parallel requests - # to avoid the model going OOM in CI. - max_num_seqs=32, - io_processor_plugin="wrong_plugin", - ) - - -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str): - - img_prompt = dict( - data=image_url, - data_format="url", - image_format="tiff", - out_data_format="b64_json", - ) - - pooling_params = PoolingParams(task="encode", softmax=False) - - with vllm_runner( - model_name, - runner="pooling", - skip_tokenizer_init=True, - trust_remote_code=True, - enforce_eager=True, - # Limit the maximum number of parallel requests - # to avoid the model going OOM in CI. 
- max_num_seqs=1, - io_processor_plugin="prithvi_to_tiff_valencia", - ) as llm_runner: - pooler_output = llm_runner.get_llm().encode( - img_prompt, - pooling_params=pooling_params, - ) - output = pooler_output[0].outputs - - # verify the output is formatted as expected for this plugin - assert all( - hasattr(output, attr) - for attr in ["type", "format", "data", "request_id"]) - - # We just check that the output is a valid base64 string. - # Raises an exception and fails the test if the string is corrupted. - base64.b64decode(output.data) - - -@pytest.fixture(scope="module") +@pytest.fixture(scope="function") def server(): args = [ "--runner", @@ -90,7 +35,9 @@ def server(): "--max-num-seqs", "32", "--io-processor-plugin", - "prithvi_to_tiff_valencia" + "prithvi_to_tiff_valencia", + "--model-impl", + "terratorch", ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -136,3 +83,43 @@ async def test_prithvi_mae_plugin_online( # We just check that the output is a valid base64 string. # Raises an exception and fails the test if the string is corrupted. base64.b64decode(plugin_data["data"]) + + +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str): + + img_prompt = dict( + data=image_url, + data_format="url", + image_format="tiff", + out_data_format="b64_json", + ) + + pooling_params = PoolingParams(task="encode", softmax=False) + + with vllm_runner( + model_name, + runner="pooling", + skip_tokenizer_init=True, + trust_remote_code=True, + enforce_eager=True, + # Limit the maximum number of parallel requests + # to avoid the model going OOM in CI. 
+ max_num_seqs=1, + model_impl="terratorch", + io_processor_plugin="prithvi_to_tiff_valencia", + ) as llm_runner: + pooler_output = llm_runner.get_llm().encode( + img_prompt, + pooling_params=pooling_params, + ) + output = pooler_output[0].outputs + + # verify the output is formatted as expected for this plugin + assert all( + hasattr(output, attr) + for attr in ["type", "format", "data", "request_id"]) + + # We just check that the output is a valid base64 string. + # Raises an exception and fails the test if the string is corrupted. + base64.b64decode(output.data) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 2cea2695a66e5..7c2b497022658 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -171,6 +171,7 @@ class ModelImpl(str, enum.Enum): AUTO = "auto" VLLM = "vllm" TRANSFORMERS = "transformers" + TERRATORCH = "terratorch" def get_attr_docs(cls: type[Any]) -> dict[str, str]: @@ -496,7 +497,9 @@ class ModelConfig: back to the Transformers implementation if no vLLM implementation is available.\n - "vllm" will use the vLLM model implementation.\n - - "transformers" will use the Transformers model implementation.""" + - "transformers" will use the Transformers model implementation.\n + - "terratorch" will use the TerraTorch model implementation. 
+ """ override_attention_dtype: Optional[str] = None """Override dtype for attention""" logits_processors: Optional[list[Union[str, type[LogitsProcessor]]]] = None diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index feca60f2f001e..38d300b03d2c4 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -184,10 +184,11 @@ _EMBEDDING_MODELS = { "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501 - # Technically PrithviGeoSpatialMAE is a model that works on images, both in - # input and output. I am adding it here because it piggybacks on embedding + # Technically Terratorch models work on images, both in + # input and output. I am adding it here because it piggy-backs on embedding # models for the time being. 
- "PrithviGeoSpatialMAE": ("prithvi_geospatial_mae", "PrithviGeoSpatialMAE"), + "PrithviGeoSpatialMAE": ("terratorch", "Terratorch"), + "Terratorch": ("terratorch", "Terratorch"), } _CROSS_ENCODER_MODELS = { @@ -639,6 +640,9 @@ class _ModelRegistry: model_info = self._try_inspect_model_cls(arch) if model_info is not None: return (model_info, arch) + elif model_config.model_impl == ModelImpl.TERRATORCH: + model_info = self._try_inspect_model_cls("Terratorch") + return (model_info, "Terratorch") # Fallback to transformers impl (after resolving convert_type) if (all(arch not in self.models for arch in architectures) @@ -687,6 +691,11 @@ class _ModelRegistry: model_cls = self._try_load_model_cls(arch) if model_cls is not None: return (model_cls, arch) + elif model_config.model_impl == ModelImpl.TERRATORCH: + arch = "Terratorch" + model_cls = self._try_load_model_cls(arch) + if model_cls is not None: + return (model_cls, arch) # Fallback to transformers impl (after resolving convert_type) if (all(arch not in self.models for arch in architectures) diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/terratorch.py similarity index 52% rename from vllm/model_executor/models/prithvi_geospatial_mae.py rename to vllm/model_executor/models/terratorch.py index 2edc357d2df1b..739396a4932cb 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/terratorch.py @@ -15,13 +15,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Inference-only IBM/NASA Prithvi Geospatial model.""" +"""Wrapper around `Terratorch` models""" +from collections import OrderedDict from collections.abc import Iterable, Mapping, Sequence -from typing import Any, Optional, Union +from typing import Any, Callable, Optional, Union import torch import torch.nn as nn +from terratorch.vllm import (DummyDataGenerator, InferenceRunner, + InputDefinition, InputTypeEnum) from transformers import BatchFeature from vllm.config import VllmConfig @@ -29,6 +32,7 @@ from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.utils import AutoWeightsLoader from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.cache import MultiModalProcessorOnlyCache from vllm.multimodal.inputs import (ImageItem, ModalityData, MultiModalDataDict, MultiModalFieldConfig, MultiModalInputs, MultiModalKwargsItems, @@ -45,52 +49,46 @@ from .interfaces import (IsAttentionFree, MultiModalEmbeddings, from .interfaces_base import default_pooling_type -def _prithvi_field_config(hf_inputs: Mapping[str, torch.Tensor]): - # This model receives in input a multi-dimensional tensor representing - # a single image patch and therefore it is not to be split - # into multiple elements, but rather to be considered a single one. - # Hence, the decision of using a MultiModalSharedField. - # The expected shape is (num_channels, width, height). - - # This model however allows the user to also submit multiple image - # patches as a batch, adding a further dimension to the above shape. - # At this stage we only support submitting one patch per request and - # batching is achieved via vLLM batching. - # TODO (christian-pinto): enable support for multi patch requests - # in tandem with vLLM batching. 
- return dict( - pixel_values=MultiModalFieldConfig.shared(batch_size=1, - modality="image"), - location_coords=MultiModalFieldConfig.shared(batch_size=1, - modality="image"), - ) +def _terratorch_field_names(pretrained_cfg: dict): + input_definition = InputDefinition(**pretrained_cfg["input"]) + return set(input_definition.data.keys()) -class PrithviGeoSpatialMAEMultiModalDataParser(MultiModalDataParser): +def _terratorch_field_factory( + pretrained_cfg: dict +) -> Callable[ + [Mapping[str, torch.Tensor]], + Mapping[str, MultiModalFieldConfig], +]: - def _parse_image_data( - self, - data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]], - ) -> Optional[ModalityDataItems[Any, Any]]: - if isinstance(data, dict): - return DictEmbeddingItems( - data, - modality="image", - required_fields={"pixel_values", "location_coords"}, - fields_factory=_prithvi_field_config, - ) + def _terratorch_field_config(hf_inputs: Mapping[str, torch.Tensor]): + input_definition = InputDefinition(**pretrained_cfg["input"]) + fields = {} + for input_name, input in input_definition.data.items(): + if input.type == InputTypeEnum.tensor: + fields[input_name] = "image" - return super()._parse_image_data(data) + mm_fields_config = {} + for field_name, field_modality in fields.items(): + mm_fields_config[field_name] = MultiModalFieldConfig.shared( + batch_size=1, modality=field_modality) + return mm_fields_config + + return _terratorch_field_config -class PrithviGeoSpatialMAEProcessingInfo(BaseProcessingInfo): +class TerratorchProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} -class PrithviGeoSpatialMAEInputBuilder( - BaseDummyInputsBuilder[PrithviGeoSpatialMAEProcessingInfo]): +class TerratorchInputBuilder(BaseDummyInputsBuilder[TerratorchProcessingInfo]): + + def __init__(self, info: TerratorchProcessingInfo): + super().__init__(info) + self.dummy_data_generator = DummyDataGenerator( + 
self.info.get_hf_config().to_dict()["pretrained_cfg"]) def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: return "" @@ -100,29 +98,57 @@ class PrithviGeoSpatialMAEInputBuilder( seq_len: int, mm_counts: Mapping[str, int], ) -> MultiModalDataDict: - # This model input is fixed and is in the form of a torch Tensor. - # The size of pixel_values might change in the cases where we resize - # the input but never exceeds the dimensions below. - image_data = { - "pixel_values": torch.full((6, 512, 512), 1.0, - dtype=torch.float16), - "location_coords": torch.full((1, 2), 1.0, dtype=torch.float16), - } - - return {"image": image_data} + # Dummy data is generated based on the 'input' section + # defined in the HF configuration file + return self.dummy_data_generator.get_dummy_mm_data() -class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor): +class TerratorchMultiModalDataParser(MultiModalDataParser): + + def __init__(self, pretrained_cfg: dict, *args, **kwargs): + self._pretrained_cfg = pretrained_cfg + super().__init__(*args, **kwargs) + + def _parse_image_data( + self, + data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]], + ) -> Optional[ModalityDataItems[Any, Any]]: + if isinstance(data, dict): + + terratorch_fields = _terratorch_field_names(self._pretrained_cfg) + + return DictEmbeddingItems( + data, + modality="image", + required_fields=terratorch_fields, + fields_factory=_terratorch_field_factory(self._pretrained_cfg), + ) + + return super()._parse_image_data(data) + + +class TerratorchMultiModalProcessor(BaseMultiModalProcessor): + + def __init__( + self, + info: TerratorchProcessingInfo, + dummy_inputs: "BaseDummyInputsBuilder[TerratorchProcessingInfo]", + *, + cache: Optional[MultiModalProcessorOnlyCache] = None) -> None: + + self.pretrained_cfg = info.get_hf_config().to_dict()["pretrained_cfg"] + super().__init__(info=info, dummy_inputs=dummy_inputs, cache=cache) def _get_data_parser(self) -> MultiModalDataParser: - return 
PrithviGeoSpatialMAEMultiModalDataParser() + return TerratorchMultiModalDataParser( + pretrained_cfg=self.pretrained_cfg) def _get_mm_fields_config( self, hf_inputs: BatchFeature, hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: - return _prithvi_field_config(hf_inputs) + return _terratorch_field_factory(self.pretrained_cfg)(hf_inputs) def _get_prompt_updates( self, @@ -173,13 +199,11 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor): @default_pooling_type("All") @MULTIMODAL_REGISTRY.register_processor( - PrithviGeoSpatialMAEMultiModalProcessor, - info=PrithviGeoSpatialMAEProcessingInfo, - dummy_inputs=PrithviGeoSpatialMAEInputBuilder, + TerratorchMultiModalProcessor, + info=TerratorchProcessingInfo, + dummy_inputs=TerratorchInputBuilder, ) -class PrithviGeoSpatialMAE(nn.Module, IsAttentionFree, SupportsMultiModal): - """Prithvi Masked Autoencoder""" - +class Terratorch(nn.Module, IsAttentionFree, SupportsMultiModal): supports_multimodal_raw_input_only = True is_pooling_model = True @@ -190,43 +214,13 @@ class PrithviGeoSpatialMAE(nn.Module, IsAttentionFree, SupportsMultiModal): raise ValueError("Only image modality is supported") - def _instantiate_model(self, config: dict) -> Optional[nn.Module]: - # We might be able/need to support different tasks with this same model - if config["task_args"]["task"] == "SemanticSegmentationTask": - from terratorch.cli_tools import SemanticSegmentationTask - - task = SemanticSegmentationTask( - config["model_args"], - config["task_args"]["model_factory"], - loss=config["task_args"]["loss"], - lr=config["task_args"]["lr"], - ignore_index=config["task_args"]["ignore_index"], - optimizer=config["task_args"]["optimizer"], - optimizer_hparams=config["optimizer_params"], - scheduler=config["task_args"]["scheduler"], - scheduler_hparams=config["scheduler_params"], - plot_on_val=config["task_args"]["plot_on_val"], - freeze_decoder=config["task_args"]["freeze_decoder"], - 
freeze_backbone=config["task_args"]["freeze_backbone"], - ) - - return task.model - else: - return None - def __init__(self, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - # the actual model is dynamically instantiated using terratorch - # allowing us to perform changes to the model architecture - # at startup time (e.g., change the model decoder class.) - self.model = self._instantiate_model( - vllm_config.model_config.hf_config.to_dict()["pretrained_cfg"]) - if self.model is None: - raise ValueError( - "Unsupported task. " - "Only SemanticSegmentationTask is supported for now " - "by PrithviGeospatialMAE.") + config = vllm_config.model_config.hf_config.to_dict()["pretrained_cfg"] + + self.inference_runner = InferenceRunner(config) + self.model = self.inference_runner.model pooler_config = vllm_config.model_config.pooler_config assert pooler_config is not None @@ -234,23 +228,6 @@ class PrithviGeoSpatialMAE(nn.Module, IsAttentionFree, SupportsMultiModal): self.pooler = DispatchPooler( {"encode": Pooler.for_encode(pooler_config)}, ) - def _parse_and_validate_multimodal_data( - self, **kwargs) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - pixel_values = kwargs.pop("pixel_values", None) - if not isinstance(pixel_values, torch.Tensor): - raise ValueError(f"Incorrect type of pixel_values. " - f"Got type: {type(pixel_values)}") - - location_coords = kwargs.pop("location_coords", None) - if not isinstance(location_coords, torch.Tensor): - raise ValueError(f"Incorrect type of location_coords. 
" - f"Got type: {type(location_coords)}") - location_coords = torch.unbind(location_coords, dim=0)[0] - if location_coords.shape == torch.Size([0]): - location_coords = None - - return pixel_values, location_coords - def get_input_embeddings( self, input_ids: torch.Tensor, @@ -270,10 +247,7 @@ class PrithviGeoSpatialMAE(nn.Module, IsAttentionFree, SupportsMultiModal): inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ): - pixel_values, location_coords = ( - self._parse_and_validate_multimodal_data(**kwargs)) - model_output = self.model(pixel_values, - location_coords=location_coords) + model_output = self.inference_runner.forward(**kwargs) return model_output.output @@ -283,28 +257,34 @@ class PrithviGeoSpatialMAE(nn.Module, IsAttentionFree, SupportsMultiModal): model_buffers = dict(self.named_buffers()) loaded_buffers = [] for key, value in weights: - if key == "state_dict": - weights_to_parse = value - for name, weight in weights_to_parse.items(): - if "pos_embed" in name: - continue + if isinstance(value, (dict, OrderedDict)): + if key == "state_dict": + weights_to_parse = value + for name, weight in weights_to_parse.items(): + name = f"inference_runner.{name}" - if "_timm_module." in name: - name = name.replace("_timm_module.", "") + if "pos_embed" in name: + continue - # this model requires a couple of buffers to be loaded - # that are not loadable with the AutoWeightsLoader - if name in model_buffers: if "_timm_module." in name: name = name.replace("_timm_module.", "") - buffer = model_buffers[name] - weight_loader = getattr(buffer, "weight_loader", - default_weight_loader) - weight_loader(buffer, weight) - loaded_buffers.append(name) - else: - params_list.append((name, weight)) - break + + # this model requires a couple of buffers to be loaded + # that are not loadable with the AutoWeightsLoader + if name in model_buffers: + if "_timm_module." 
in name: + name = name.replace("_timm_module.", "") + buffer = model_buffers[name] + weight_loader = getattr(buffer, "weight_loader", + default_weight_loader) + weight_loader(buffer, weight) + loaded_buffers.append(name) + else: + params_list.append((name, weight)) + break + + elif isinstance(value, torch.Tensor): + params_list.append((f"inference_runner.model.{key}", value)) # Load the remaining model parameters loader = AutoWeightsLoader(self) From 04f3c35cff1a3ea240d3977842b23ae5e853eb9f Mon Sep 17 00:00:00 2001 From: anthonsu <50185138+anthonsu@users.noreply.github.com> Date: Thu, 4 Sep 2025 02:41:41 -0700 Subject: [PATCH 75/95] Improve flexibility of auto_tune.sh execution. (#23766) Signed-off-by: Anthony Su <50185138+anthonsu@users.noreply.github.com> Signed-off-by: anthonsu <50185138+anthonsu@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- benchmarks/auto_tune/README.md | 6 +++++ benchmarks/auto_tune/auto_tune.sh | 44 +++++++++++++++++++++---------- 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md index 9aad51df6e003..3aa988aac2548 100644 --- a/benchmarks/auto_tune/README.md +++ b/benchmarks/auto_tune/README.md @@ -31,6 +31,12 @@ cd vllm You must set the following variables at the top of the script before execution. + Note: You can also override the default values below via environment variables when running the script. + +```bash +MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="1024 2048 4096" VLLM_LOGGING_LEVEL=DEBUG bash auto_tune.sh +``` + | Variable | Description | Example Value | | --- | --- | --- | | `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. 
| `"$HOME"` | diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh index d9d0fe4e0ccba..ed3679b66f805 100644 --- a/benchmarks/auto_tune/auto_tune.sh +++ b/benchmarks/auto_tune/auto_tune.sh @@ -5,25 +5,41 @@ TAG=$(date +"%Y_%m_%d_%H_%M") SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -BASE="$SCRIPT_DIR/../../.." -MODEL="meta-llama/Llama-3.1-8B-Instruct" -SYSTEM="TPU" -TP=1 -DOWNLOAD_DIR="" -INPUT_LEN=4000 -OUTPUT_LEN=16 -MAX_MODEL_LEN=4096 -MIN_CACHE_HIT_PCT=0 -MAX_LATENCY_ALLOWED_MS=100000000000 -NUM_SEQS_LIST="128 256" -NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096" +VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO} +BASE=${BASE:-"$SCRIPT_DIR/../../.."} +MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"} +SYSTEM=${SYSTEM:-"TPU"} +TP=${TP:-1} +DOWNLOAD_DIR=${DOWNLOAD_DIR:-""} +INPUT_LEN=${INPUT_LEN:-4000} +OUTPUT_LEN=${OUTPUT_LEN:-16} +MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096} +MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0} +MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000} +NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"} +NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"} LOG_FOLDER="$BASE/auto-benchmark/$TAG" RESULT="$LOG_FOLDER/result.txt" PROFILE_PATH="$LOG_FOLDER/profile" -echo "result file: $RESULT" -echo "model: $MODEL" +echo "====================== AUTO TUNE PARAMETERS ====================" +echo "SCRIPT_DIR=$SCRIPT_DIR" +echo "BASE=$BASE" +echo "MODEL=$MODEL" +echo "SYSTEM=$SYSTEM" +echo "TP=$TP" +echo "DOWNLOAD_DIR=$DOWNLOAD_DIR" +echo "INPUT_LEN=$INPUT_LEN" +echo "OUTPUT_LEN=$OUTPUT_LEN" +echo "MAX_MODEL_LEN=$MAX_MODEL_LEN" +echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT" +echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS" +echo "NUM_SEQS_LIST=$NUM_SEQS_LIST" +echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST" +echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL" +echo "RESULT_FILE=$RESULT" +echo "====================== AUTO TUNE PARAMETERS ====================" rm -rf $LOG_FOLDER 
rm -rf $PROFILE_PATH From 3efb9f4d95bfe8810b106637cd9eb45693c93e4d Mon Sep 17 00:00:00 2001 From: whx <56632993+whx-sjtu@users.noreply.github.com> Date: Thu, 4 Sep 2025 17:46:37 +0800 Subject: [PATCH 76/95] [Attention][Platform] Refactor MLA to support Custom Op (#23332) Signed-off-by: whx-sjtu <2952154980@qq.com> --- vllm/model_executor/layers/mla.py | 158 ++++++++++++++++++++++ vllm/model_executor/models/deepseek_v2.py | 86 ++++-------- 2 files changed, 186 insertions(+), 58 deletions(-) create mode 100644 vllm/model_executor/layers/mla.py diff --git a/vllm/model_executor/layers/mla.py b/vllm/model_executor/layers/mla.py new file mode 100644 index 0000000000000..a05716190365f --- /dev/null +++ b/vllm/model_executor/layers/mla.py @@ -0,0 +1,158 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import dataclass +from typing import Optional + +import torch + +from vllm.attention import Attention +from vllm.config import CacheConfig +from vllm.model_executor.custom_op import CustomOp +from vllm.model_executor.layers.quantization import QuantizationConfig + + +@dataclass +class MLAModules: + """Modules used in MLA. + """ + kv_a_layernorm: torch.nn.Module + kv_b_proj: torch.nn.Module + rotary_emb: torch.nn.Module + o_proj: torch.nn.Module + fused_qkv_a_proj: Optional[torch.nn.Module] + kv_a_proj_with_mqa: Optional[torch.nn.Module] + q_a_layernorm: Optional[torch.nn.Module] + q_b_proj: Optional[torch.nn.Module] + q_proj: Optional[torch.nn.Module] + + +@CustomOp.register("multi_head_latent_attention") +class MultiHeadLatentAttention(CustomOp): + """MLA layer registered as CustomOp. + Note that currently MLA ignores the enable/disable mechanism of CustomOp + because there is only one in-tree implementation in forward_native. + TODO: implement this with a new PluggableLayer mechanism. + + This class takes positions and hidden_states as input. 
+ The input tensors can either contain prefill tokens or decode tokens. + The class does the following: + + 1. MLA Preprocess. + 2. Perform multi-head attention to prefill tokens and + multi-query attention to decode tokens separately. + 3. Return the output tensor. + """ + + def __init__( + self, + hidden_size: int, + num_heads: int, + scale: float, + qk_nope_head_dim: int, + qk_rope_head_dim: int, + v_head_dim: int, + q_lora_rank: Optional[int], + kv_lora_rank: int, + mla_modules: MLAModules, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = hidden_size + self.qk_nope_head_dim = qk_nope_head_dim + self.qk_rope_head_dim = qk_rope_head_dim + self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim + self.v_head_dim = v_head_dim + self.q_lora_rank = q_lora_rank + self.kv_lora_rank = kv_lora_rank + self.num_heads = num_heads + self.fused_qkv_a_proj = mla_modules.fused_qkv_a_proj + self.kv_a_proj_with_mqa = mla_modules.kv_a_proj_with_mqa + self.q_a_layernorm = mla_modules.q_a_layernorm + self.q_b_proj = mla_modules.q_b_proj + self.q_proj = mla_modules.q_proj + self.kv_a_layernorm = mla_modules.kv_a_layernorm + self.kv_b_proj = mla_modules.kv_b_proj + self.rotary_emb = mla_modules.rotary_emb + self.o_proj = mla_modules.o_proj + + # In the MLA backend, kv_cache includes both k_c and + # pe (i.e. decoupled position embeddings). In particular, + # the concat_and_cache_mla op requires + # k_c.size(1) + k_pe.size(1) == kv_cache.size(2) + # i.e. 
+ # kv_lora_rank + qk_rope_head_dim == head_size + self.mla_attn = Attention( + num_heads=self.num_heads, + head_size=self.kv_lora_rank + self.qk_rope_head_dim, + scale=scale, + num_kv_heads=1, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + use_mla=True, + # MLA Args + q_lora_rank=self.q_lora_rank, + kv_lora_rank=self.kv_lora_rank, + qk_nope_head_dim=self.qk_nope_head_dim, + qk_rope_head_dim=self.qk_rope_head_dim, + qk_head_dim=self.qk_head_dim, + v_head_dim=self.v_head_dim, + kv_b_proj=self.kv_b_proj, + ) + + self.prefix = prefix + self.debug_layer_idx = int(self.prefix.split(".")[-2]) + + def forward_native( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + q_c = None + kv_lora = None + + if self.q_lora_rank is not None: + assert self.fused_qkv_a_proj is not None, \ + "fused_qkv_a_proj is required when q_lora_rank is not None" + assert self.q_a_layernorm is not None, \ + "q_a_layernorm is required when q_lora_rank is not None" + assert self.q_b_proj is not None, \ + "q_b_proj is required when q_lora_rank is not None" + qkv_lora = self.fused_qkv_a_proj(hidden_states)[0] + q_c, kv_lora = qkv_lora.split( + [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim], + dim=-1, + ) + q_c = self.q_a_layernorm(q_c) + q = self.q_b_proj(q_c)[0] + else: + assert self.kv_a_proj_with_mqa is not None, \ + "kv_a_proj_with_mqa is required when q_lora_rank is None" + assert self.q_proj is not None, \ + "q_proj is required when q_lora_rank is None" + kv_lora = self.kv_a_proj_with_mqa(hidden_states)[0] + q = self.q_proj(hidden_states)[0] + + kv_c, k_pe = kv_lora.split([self.kv_lora_rank, self.qk_rope_head_dim], + dim=-1) + kv_c_normed = self.kv_a_layernorm(kv_c) + + q = q.view(-1, self.num_heads, self.qk_head_dim) + # Add head dim of 1 to k_pe + k_pe = k_pe.unsqueeze(1) + + q[..., self.qk_nope_head_dim:], k_pe = self.rotary_emb( + positions, q[..., self.qk_nope_head_dim:], k_pe) + + attn_out = 
self.mla_attn( + q, + kv_c_normed, + k_pe, + output_shape=(hidden_states.shape[0], + self.num_heads * self.v_head_dim)) + return self.o_proj(attn_out)[0] + + def forward_cuda(self, *args, **kwargs): + return self.forward_native(*args, **kwargs) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 7db6fc5d8ad89..bb95a1dbf122e 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -47,6 +47,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.mla import MLAModules, MultiHeadLatentAttention from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.shared_fused_moe import SharedFusedMoE @@ -492,72 +493,41 @@ class DeepseekV2MLAAttention(nn.Module): mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) self.scaling = self.scaling * mscale * mscale - # In the MLA backend, kv_cache includes both k_c and - # pe (i.e. decoupled position embeddings). In particular, - # the concat_and_cache_mla op requires - # k_c.size(1) + k_pe.size(1) == kv_cache.size(2) - # i.e. 
- # kv_lora_rank + qk_rope_head_dim == head_size - self.mla_attn = Attention( - num_heads=self.num_local_heads, - head_size=self.kv_lora_rank + self.qk_rope_head_dim, - scale=self.scaling, - num_kv_heads=1, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - use_mla=True, - # MLA Args - q_lora_rank=self.q_lora_rank, - kv_lora_rank=self.kv_lora_rank, - qk_nope_head_dim=self.qk_nope_head_dim, - qk_rope_head_dim=self.qk_rope_head_dim, - qk_head_dim=self.qk_head_dim, - v_head_dim=self.v_head_dim, + mla_modules = MLAModules( + kv_a_layernorm=self.kv_a_layernorm, kv_b_proj=self.kv_b_proj, + rotary_emb=self.rotary_emb, + o_proj=self.o_proj, + fused_qkv_a_proj=self.fused_qkv_a_proj + if self.q_lora_rank is not None else None, + kv_a_proj_with_mqa=self.kv_a_proj_with_mqa + if self.q_lora_rank is None else None, + q_a_layernorm=self.q_a_layernorm + if self.q_lora_rank is not None else None, + q_b_proj=self.q_b_proj if self.q_lora_rank is not None else None, + q_proj=self.q_proj if self.q_lora_rank is None else None, + ) + self.mla_attn = MultiHeadLatentAttention( + self.hidden_size, + self.num_local_heads, + self.scaling, + self.qk_nope_head_dim, + self.qk_rope_head_dim, + self.v_head_dim, + self.q_lora_rank, + self.kv_lora_rank, + mla_modules, + cache_config, + quant_config, + prefix, ) - - self.prefix = prefix - self.debug_layer_idx = int(self.prefix.split(".")[-2]) def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, ) -> torch.Tensor: - q_c = None - kv_lora = None - - if self.q_lora_rank is not None: - qkv_lora = self.fused_qkv_a_proj(hidden_states)[0] - q_c, kv_lora = qkv_lora.split( - [self.q_lora_rank, self.kv_lora_rank + self.qk_rope_head_dim], - dim=-1, - ) - q_c = self.q_a_layernorm(q_c) - q = self.q_b_proj(q_c)[0] - else: - kv_lora = self.kv_a_proj_with_mqa(hidden_states)[0] - q = self.q_proj(hidden_states)[0] - - kv_c, k_pe = kv_lora.split([self.kv_lora_rank, self.qk_rope_head_dim], - dim=-1) - kv_c_normed = 
self.kv_a_layernorm(kv_c) - - q = q.view(-1, self.num_local_heads, self.qk_head_dim) - # Add head dim of 1 to k_pe - k_pe = k_pe.unsqueeze(1) - - q[..., self.qk_nope_head_dim:], k_pe = self.rotary_emb( - positions, q[..., self.qk_nope_head_dim:], k_pe) - - attn_out = self.mla_attn( - q, - kv_c_normed, - k_pe, - output_shape=(hidden_states.shape[0], - self.num_local_heads * self.v_head_dim)) - return self.o_proj(attn_out)[0] + return self.mla_attn(positions, hidden_states) class DeepseekV2DecoderLayer(nn.Module): From 2c301ee2eb2d60015936c5e34c80fa62d3c2d37d Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Thu, 4 Sep 2025 17:47:08 +0800 Subject: [PATCH 77/95] [Bugfix] Fix Incremental Detokenization with `tokenizers == 0.22.0` (#24159) Signed-off-by: Fanli Lin Signed-off-by: Fanli Lin Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/v1/engine/detokenizer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 04ad51aae0a8c..0ccbe65493499 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -234,7 +234,7 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer): try: token = self.stream.step(self.tokenizer, next_token_id) except Exception as e: - if str(e) != INVALID_PREFIX_ERR_MSG: + if not str(e).startswith(INVALID_PREFIX_ERR_MSG): raise e # Recover from edge case where tokenizer can produce non-monotonic, # invalid UTF-8 output, which breaks the internal state of @@ -243,7 +243,8 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer): logger.warning( "Encountered invalid prefix detokenization error" " for request %s, resetting decode stream.", self.request_id) - self.stream = DecodeStream(self.skip_special_tokens) + self.stream = DecodeStream( + skip_special_tokens=self.skip_special_tokens) token = self.stream.step(self.tokenizer, next_token_id) return token From 
402759d4727d9a377598a09d06770770d4e184c6 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 4 Sep 2025 05:47:59 -0400 Subject: [PATCH 78/95] [Attention] FlashAttn MLA (#14258) Signed-off-by: Lucas Wilkinson Signed-off-by: Lucas Wilkinson Signed-off-by: Matthew Bonanni Co-authored-by: Matthew Bonanni Co-authored-by: Matthew Bonanni --- .buildkite/check-wheel-size.py | 8 +- cmake/external_projects/vllm_flash_attn.cmake | 2 +- docker/Dockerfile | 2 +- .../attention/test_attention_selector.py | 106 +++++++--- tests/v1/attention/test_attention_backends.py | 16 -- tests/v1/attention/test_mla_backends.py | 184 ++++++++--------- tests/v1/attention/utils.py | 2 + vllm/attention/utils/fa_utils.py | 13 ++ vllm/engine/arg_utils.py | 2 + vllm/envs.py | 1 + vllm/platforms/cuda.py | 68 ++++--- vllm/platforms/interface.py | 5 +- vllm/v1/attention/backends/flashinfer.py | 3 +- vllm/v1/attention/backends/linear_attn.py | 5 +- vllm/v1/attention/backends/mamba1_attn.py | 5 +- vllm/v1/attention/backends/mamba2_attn.py | 5 +- vllm/v1/attention/backends/mla/common.py | 17 +- .../attention/backends/mla/flashattn_mla.py | 189 ++++++++++++++++++ vllm/v1/attention/backends/mla/flashmla.py | 10 +- .../attention/backends/mla/rocm_aiter_mla.py | 14 +- vllm/v1/attention/backends/short_conv_attn.py | 7 +- vllm/v1/attention/backends/xformers.py | 16 +- 22 files changed, 480 insertions(+), 200 deletions(-) create mode 100644 vllm/v1/attention/backends/mla/flashattn_mla.py diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py index 68aff793ae6aa..76f6d7aeca0d8 100644 --- a/.buildkite/check-wheel-size.py +++ b/.buildkite/check-wheel-size.py @@ -5,11 +5,11 @@ import os import sys import zipfile -# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB -# Note that we have 400 MiB quota, please use it wisely. -# See https://github.com/pypi/support/issues/3792 . 
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB +# Note that we have 800 MiB quota, please use it wisely. +# See https://github.com/pypi/support/issues/6326 . # Please also sync the value with the one in Dockerfile. -VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400)) +VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450)) def print_top_10_largest_files(zip_file): diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index 49defccbb1fa4..3d32121f13ac2 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 57b4e68b9f9d94750b46de8f8dbd2bfcc86edd4f + GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/docker/Dockerfile b/docker/Dockerfile index 01b7aa0f44afd..6f8ca30ffd31b 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -237,7 +237,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ # Check the size of the wheel if RUN_WHEEL_CHECK is true COPY .buildkite/check-wheel-size.py check-wheel-size.py # sync the default value with .buildkite/check-wheel-size.py -ARG VLLM_MAX_SIZE_MB=400 +ARG VLLM_MAX_SIZE_MB=450 ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB ARG RUN_WHEEL_CHECK=true RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index aea166da3af2f..3c2aaabacae8c 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -22,7 +22,7 @@ def clear_cache(): # Define MLA and non-MLA backends separately DEVICE_MLA_BACKENDS = { - "cuda": ["TRITON_MLA", "FLASHMLA"], + "cuda": 
["TRITON_MLA", "FLASHMLA", "FLASH_ATTN_MLA", "CUTLASS_MLA"], "hip": ["TRITON_MLA", "ROCM_AITER_MLA"], "cpu": [], } @@ -98,21 +98,14 @@ def test_env( with patch("vllm.attention.selector.current_platform", RocmPlatform()): if use_mla: - # Validate HIP MLA backend-block_size combinations - valid_combination = ( - (name == "TRITON_MLA" and block_size != 1) - or (name == "ROCM_AITER_MLA" and block_size == 1)) + # ROCm MLA backend logic: + # - TRITON_MLA: supported when block_size != 1 + # - ROCM_AITER_MLA: supported when block_size == 1 + # If backend is forced but doesn't match block_size, + # should raise ValueError - if valid_combination: - backend = get_attn_backend(16, - torch.float16, - torch.float16, - block_size, - False, - use_mla=use_mla) - expected = f"{name}_VLLM_V1" if use_v1 else name - assert backend.get_name() == expected - else: + if name == "TRITON_MLA" and block_size == 1: + # TRITON_MLA doesn't support block_size == 1 with pytest.raises(ValueError) as exc_info: get_attn_backend(16, torch.float16, @@ -122,6 +115,27 @@ def test_env( use_mla=use_mla) assert f"The selected backend, {name}" in str( exc_info.value) + elif name == "ROCM_AITER_MLA" and block_size != 1: + # ROCM_AITER_MLA only supports block_size == 1 + with pytest.raises(ValueError) as exc_info: + get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + assert f"The selected backend, {name}" in str( + exc_info.value) + else: + # Valid backend-block_size combination + backend = get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + expected = f"{name}_VLLM_V1" if use_v1 else name + assert backend.get_name() == expected else: backend = get_attn_backend(16, torch.float16, @@ -136,16 +150,22 @@ def test_env( with patch("vllm.attention.selector.current_platform", CudaPlatform()): if use_mla: - if name == "FLASHMLA" and block_size == 64: - from vllm.attention.backends.flashmla import ( - is_flashmla_supported) + 
# CUDA MLA backend logic: + # - CUTLASS_MLA: only supported with block_size == 128 + # and Blackwell GPUs (SM 10.0), V1 only + # - FLASHMLA: only supported with block_size == 64 + # - FLASH_ATTN_MLA: V1 only + # - TRITON_MLA: fallback for other cases - # only on cuda platforms with specific capability. - is_supported, _ = is_flashmla_supported() - - if not is_supported: - # if platform is not supported then skip this case. - pytest.skip() + if name == "CUTLASS_MLA": + if not use_v1: + # CUTLASS_MLA only supported on V1 engine + pytest.skip( + "CUTLASS_MLA only supported on V1 engine") + elif block_size != 128: + # CUTLASS_MLA only supports block_size == 128 + pytest.skip( + "CUTLASS_MLA only supports block_size 128") else: backend = get_attn_backend(16, torch.float16, @@ -153,9 +173,45 @@ def test_env( block_size, False, use_mla=use_mla) - expected = f"{name}_VLLM_V1" if use_v1 else name + expected = "CUTLASS_MLA_VLLM_V1" + assert backend.get_name() == expected + elif name == "FLASHMLA": + if block_size != 64: + # FlashMLA only supports block_size == 64 + pytest.skip("FlashMLA only supports block_size 64") + else: + from vllm.attention.backends.flashmla import ( + is_flashmla_supported) + is_supported, _ = is_flashmla_supported() + if not is_supported: + pytest.skip( + "FlashMLA not supported on this platform") + else: + backend = get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + expected = f"{name}_VLLM_V1" if use_v1 else name + assert backend.get_name() == expected + elif name == "FLASH_ATTN_MLA": + if not use_v1: + # FlashAttention MLA only supported on V1 engine + pytest.skip( + "FlashAttention MLA only supported on V1 engine" + ) + else: + backend = get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + expected = "FLASH_ATTN_MLA" assert backend.get_name() == expected else: + # TRITON_MLA or other fallback backend = get_attn_backend(16, torch.float16, torch.float16, 
diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py index e4c07aae0ebed..1ae8b91c347a2 100644 --- a/tests/v1/attention/test_attention_backends.py +++ b/tests/v1/attention/test_attention_backends.py @@ -70,22 +70,6 @@ BATCH_SPECS = { } -def create_dummy_kv_cache(kv_cache_spec: FullAttentionSpec, - device: torch.device, - num_blocks: int = 100) -> torch.Tensor: - """Create a dummy KV cache tensor for testing.""" - kv_cache = torch.randn( - 2, # K and V - num_blocks, - kv_cache_spec.block_size, - kv_cache_spec.num_kv_heads, - kv_cache_spec.head_size, - dtype=_convert_dtype_to_torch(kv_cache_spec.dtype), - device=device, - ) - return kv_cache - - def create_and_prepopulate_kv_cache( k_contexts: list[torch.Tensor], v_contexts: list[torch.Tensor], diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py index 24070358799ef..e7cd116fdc834 100644 --- a/tests/v1/attention/test_mla_backends.py +++ b/tests/v1/attention/test_mla_backends.py @@ -15,7 +15,7 @@ from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.kv_cache_interface import FullAttentionSpec BACKENDS_TO_TEST = [ - _Backend.CUTLASS_MLA, _Backend.FLASHMLA_VLLM_V1, + _Backend.CUTLASS_MLA, _Backend.FLASHMLA_VLLM_V1, _Backend.FLASH_ATTN_MLA, _Backend.TRITON_MLA_VLLM_V1 ] @@ -69,20 +69,6 @@ BATCH_SPECS = { } -def create_dummy_kv_cache(kv_cache_spec: FullAttentionSpec, - device: torch.device, - num_blocks: int = 100) -> torch.Tensor: - """Create a dummy KV cache tensor for testing.""" - kv_cache = torch.randn( - num_blocks, - kv_cache_spec.block_size, - kv_cache_spec.head_size, # latent dimension - dtype=_convert_dtype_to_torch(kv_cache_spec.dtype), - device=device, - ) - return kv_cache - - def create_and_prepopulate_kv_cache( kv_c_contexts: list[torch.Tensor], k_pe_contexts: list[torch.Tensor], @@ -315,7 +301,7 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): # 2. 
Generate data and compute SDPA reference output for MLA all_q_vllm, all_kv_c_vllm, all_k_pe_vllm = [], [], [] - all_sdpa_outputs = [] + all_sdpa_outputs: list[list[torch.Tensor]] = [] kv_c_contexts, k_pe_contexts = [], [] # Create shared MLA weight matrices for consistency across all sequences @@ -331,6 +317,9 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): device=device) kv_b_proj_weight = torch.cat([W_UK, W_UV], dim=-1) + for i, backend in enumerate(BACKENDS_TO_TEST): + all_sdpa_outputs.append([]) + for i in range(batch_size): s_len = seq_lens[i] q_len = query_lens[i] @@ -358,85 +347,93 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): dtype=dtype, device=device) - # Determine if this is decode (single token) - # or prefill (multiple tokens) - is_decode = q_len == 1 + # Determine if this is decode or prefill + is_decode = [] + for i, backend in enumerate(BACKENDS_TO_TEST): + builder_cls, _ = get_attention_backend(backend) + is_decode.append(q_len <= builder_cls.reorder_batch_threshold) # Split q into nope and rope components q_nope, q_pe = q_c.split([qk_nope_head_dim, qk_rope_head_dim], dim=-1) - if is_decode: - # Decode path: MQA-style attention in latent space - # Transform q_nope to latent space: q_nope @ W_UK - # q_nope: [1, num_heads, qk_nope_head_dim] - # W_UK: [kv_lora_rank, num_heads, qk_nope_head_dim] - ql_nope = torch.einsum("qnh,lnh->qnl", q_nope, - W_UK) # [1, num_heads, kv_lora_rank] + ####################################################### + # Decode path: MQA-style attention in latent space + # Transform q_nope to latent space: q_nope @ W_UK + # q_nope: [1, num_heads, qk_nope_head_dim] + # W_UK: [kv_lora_rank, num_heads, qk_nope_head_dim] + ql_nope = torch.einsum("qnh,lnh->qnl", q_nope, + W_UK) # [1, num_heads, kv_lora_rank] - # Build MQA attention inputs - # Q: [1, num_heads, kv_lora_rank + qk_rope_head_dim] - q_mqa = torch.cat([ql_nope, q_pe], dim=-1) - # K: [s_len, kv_lora_rank + 
qk_rope_head_dim] - # (broadcasted to all heads) - k_mqa = torch.cat([kv_c_full, k_pe_full.squeeze(1)], dim=-1) - k_mqa = k_mqa.unsqueeze(1).expand(-1, num_q_heads, -1) - # V: [s_len, kv_lora_rank] (broadcasted to all heads) - v_mqa = kv_c_full.unsqueeze(1).expand(-1, num_q_heads, -1) + # Build MQA attention inputs + # Q: [1, num_heads, kv_lora_rank + qk_rope_head_dim] + q_mqa = torch.cat([ql_nope, q_pe], dim=-1) + # K: [s_len, kv_lora_rank + qk_rope_head_dim] + # (broadcasted to all heads) + k_mqa = torch.cat([kv_c_full, k_pe_full.squeeze(1)], dim=-1) + k_mqa = k_mqa.unsqueeze(1).expand(-1, num_q_heads, -1) + # V: [s_len, kv_lora_rank] (broadcasted to all heads) + v_mqa = kv_c_full.unsqueeze(1).expand(-1, num_q_heads, -1) - # SDPA expects (N, H, L, D) - q_sdpa_in = q_mqa.unsqueeze(0).transpose(1, 2) - k_sdpa_in = k_mqa.unsqueeze(0).transpose(1, 2) - v_sdpa_in = v_mqa.unsqueeze(0).transpose(1, 2) + # Create custom attention mask for decode path: + # - Query tokens can attend to all context tokens + # - Query tokens can only attend to query tokens up to their position + attn_mask = torch.ones(q_len, s_len, dtype=torch.bool, device=device) + # Apply causal mask only to the query portion (context_len onwards) + causal_mask = torch.tril(torch.ones(q_len, q_len, device=device)) + attn_mask[:, context_len:] = causal_mask - sdpa_out_i = torch.nn.functional.scaled_dot_product_attention( - q_sdpa_in, k_sdpa_in, v_sdpa_in, is_causal=False, scale=scale) - sdpa_out_i = sdpa_out_i.transpose(1, 2).squeeze( - 0) # [1, num_heads, kv_lora_rank] + # SDPA expects (N, H, L, D) + q_sdpa_in = q_mqa.unsqueeze(0).transpose(1, 2) + k_sdpa_in = k_mqa.unsqueeze(0).transpose(1, 2) + v_sdpa_in = v_mqa.unsqueeze(0).transpose(1, 2) - # Project back to output space: sdpa_out @ W_UV - sdpa_out_i = torch.einsum("qnl,lnv->qnv", sdpa_out_i, W_UV) - sdpa_out_i = sdpa_out_i.flatten(start_dim=-2) - else: - # Prefill path: MHA-style attention with full sequence - # Apply kv_b_proj to the full kv_c tensor 
- kv_nope_full = torch.einsum("sl,lnh->snh", kv_c_full, - kv_b_proj_weight) - k_nope_full, v_full = kv_nope_full.split( - [qk_nope_head_dim, v_head_dim], dim=-1) + sdpa_out_i_decode = torch.nn.functional.scaled_dot_product_attention( + q_sdpa_in, k_sdpa_in, v_sdpa_in, attn_mask=attn_mask, scale=scale) + sdpa_out_i_decode = sdpa_out_i_decode.transpose(1, 2).squeeze( + 0) # [1, num_heads, kv_lora_rank] - # Build attention inputs for full sequence - q_mha = torch.cat([q_nope, q_pe], - dim=-1) # [q_len, num_heads, total_dim] - k_pe_full_expanded = k_pe_full.expand(-1, num_q_heads, -1) - k_full = torch.cat([k_nope_full, k_pe_full_expanded], dim=-1) + # Project back to output space: sdpa_out @ W_UV + sdpa_out_i_decode = torch.einsum("qnl,lnv->qnv", sdpa_out_i_decode, + W_UV) + sdpa_out_i_decode = sdpa_out_i_decode.flatten(start_dim=-2) - # Create custom attention mask: - # - Query tokens can attend to all context tokens - # - Query tokens can only attend to query tokens up to their pos - attn_mask = torch.ones(q_len, - s_len, - dtype=torch.bool, - device=device) - # Apply causal mask only to the query portion (context_len onwards) - causal_mask = torch.tril(torch.ones(q_len, q_len, device=device)) - attn_mask[:, context_len:] = causal_mask + ####################################################### + # Prefill path: MHA-style attention with full sequence + # Apply kv_b_proj to the full kv_c tensor + kv_nope_full = torch.einsum("sl,lnh->snh", kv_c_full, kv_b_proj_weight) + k_nope_full, v_full = kv_nope_full.split( + [qk_nope_head_dim, v_head_dim], dim=-1) - # SDPA expects (N, H, L, D) - q_sdpa_in = q_mha.unsqueeze(0).transpose(1, 2) - k_sdpa_in = k_full.unsqueeze(0).transpose(1, 2) - v_sdpa_in = v_full.unsqueeze(0).transpose(1, 2) + # Build attention inputs for full sequence + q_mha = torch.cat([q_nope, q_pe], + dim=-1) # [q_len, num_heads, total_dim] + k_pe_full_expanded = k_pe_full.expand(-1, num_q_heads, -1) + k_full = torch.cat([k_nope_full, k_pe_full_expanded], dim=-1) 
- # Single attention call with custom mask - sdpa_out_i = torch.nn.functional.scaled_dot_product_attention( - q_sdpa_in, - k_sdpa_in, - v_sdpa_in, - attn_mask=attn_mask, - scale=scale) - sdpa_out_i = sdpa_out_i.transpose(1, 2).squeeze(0) - sdpa_out_i = sdpa_out_i.flatten(start_dim=-2) + # Create custom attention mask: + # - Query tokens can attend to all context tokens + # - Query tokens can only attend to query tokens up to their pos + attn_mask = torch.ones(q_len, s_len, dtype=torch.bool, device=device) + # Apply causal mask only to the query portion (context_len onwards) + causal_mask = torch.tril(torch.ones(q_len, q_len, device=device)) + attn_mask[:, context_len:] = causal_mask - all_sdpa_outputs.append(sdpa_out_i) + # SDPA expects (N, H, L, D) + q_sdpa_in = q_mha.unsqueeze(0).transpose(1, 2) + k_sdpa_in = k_full.unsqueeze(0).transpose(1, 2) + v_sdpa_in = v_full.unsqueeze(0).transpose(1, 2) + + # Single attention call with custom mask + sdpa_out_i_prefill = torch.nn.functional.scaled_dot_product_attention( + q_sdpa_in, k_sdpa_in, v_sdpa_in, attn_mask=attn_mask, scale=scale) + sdpa_out_i_prefill = sdpa_out_i_prefill.transpose(1, 2).squeeze(0) + sdpa_out_i_prefill = sdpa_out_i_prefill.flatten(start_dim=-2) + + for i, backend in enumerate(BACKENDS_TO_TEST): + if is_decode[i]: + all_sdpa_outputs[i].append(sdpa_out_i_decode) + else: + all_sdpa_outputs[i].append(sdpa_out_i_prefill) # Inputs for vLLM MLA backends are just the new tokens all_q_vllm.append(q_c) @@ -451,7 +448,9 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): query_vllm = torch.cat(all_q_vllm, dim=0) kv_c_vllm = torch.cat(all_kv_c_vllm, dim=0) k_pe_vllm = torch.cat(all_k_pe_vllm, dim=0) - sdpa_output = torch.cat(all_sdpa_outputs, dim=0) + sdpa_outputs = [] + for i, backend in enumerate(BACKENDS_TO_TEST): + sdpa_outputs.append(torch.cat(all_sdpa_outputs[i], dim=0)) # Create mock kv_b_proj using the same weights as reference implementation from 
vllm.model_executor.layers.linear import ColumnParallelLinear @@ -486,7 +485,7 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): randomize_blocks=True) # 4. Run vLLM backends and compare - for backend_name in BACKENDS_TO_TEST: + for i, backend_name in enumerate(BACKENDS_TO_TEST): backend_output = run_attention_backend( backend_name, kv_cache_spec, ["placeholder"], vllm_config, device, common_attn_metadata, query_vllm, kv_c_vllm, k_pe_vllm, kv_cache, @@ -494,12 +493,12 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): mock_kv_b_proj) # Check shape and dtype consistency - assert backend_output.shape == sdpa_output.shape, ( + assert backend_output.shape == sdpa_outputs[i].shape, ( f"[{backend_name}] shape {backend_output.shape} != " - f"SDPA shape {sdpa_output.shape}") - assert backend_output.dtype == sdpa_output.dtype, ( + f"SDPA shape {sdpa_outputs[i].shape}") + assert backend_output.dtype == sdpa_outputs[i].dtype, ( f"[{backend_name}] dtype {backend_output.dtype} != " - f"SDPA dtype {sdpa_output.dtype}") + f"SDPA dtype {sdpa_outputs[i].dtype}") assert torch.isfinite(backend_output).all(), ( f"[{backend_name}] produced non-finite values") @@ -508,12 +507,13 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str): rtol = 1e-2 atol = 5e-1 - max_diff = torch.max(torch.abs(backend_output - sdpa_output)).item() + max_diff = torch.max(torch.abs(backend_output - + sdpa_outputs[i])).item() max_rel_diff = torch.max( - torch.abs(backend_output - sdpa_output) / - torch.abs(sdpa_output)).item() + torch.abs(backend_output - sdpa_outputs[i]) / + torch.abs(sdpa_outputs[i])).item() all_close = torch.allclose(backend_output, - sdpa_output, + sdpa_outputs[i], rtol=rtol, atol=atol) diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index 6a08cdc56f736..5c49566240df4 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -139,6 +139,8 @@ def 
get_attention_backend(backend_name: _Backend): "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend", _Backend.FLASHMLA_VLLM_V1: "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend", + _Backend.FLASH_ATTN_MLA: + "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend", _Backend.TRITON_MLA_VLLM_V1: "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend", } diff --git a/vllm/attention/utils/fa_utils.py b/vllm/attention/utils/fa_utils.py index f8b00565f0517..dc0af7e28e3e2 100644 --- a/vllm/attention/utils/fa_utils.py +++ b/vllm/attention/utils/fa_utils.py @@ -68,5 +68,18 @@ def flash_attn_supports_fp8() -> bool: current_platform.get_device_capability().major == 9 +def flash_attn_supports_mla(): + from vllm.platforms import current_platform + if current_platform.is_cuda(): + try: + from vllm.vllm_flash_attn.flash_attn_interface import ( + is_fa_version_supported) + return is_fa_version_supported(3) \ + and current_platform.get_device_capability()[0] == 9 + except (ImportError, AssertionError): + pass + return False + + def is_flash_attn_varlen_func_available() -> bool: return current_platform.is_cuda() or current_platform.is_xpu() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d4dd545dd43a6..71ee90040f374 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1488,6 +1488,8 @@ class EngineArgs: "TRITON_MLA", "CUTLASS_MLA", "FLASHMLA", + "FLASHMLA_VLLM_V1", + "FLASH_ATTN_MLA", "FLASHINFER", "FLASHINFER_VLLM_V1", "ROCM_AITER_MLA", diff --git a/vllm/envs.py b/vllm/envs.py index 1232bd7bf9635..56adb83e8de15 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -463,6 +463,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # - "ROCM_FLASH": use ROCmFlashAttention # - "FLASHINFER": use flashinfer # - "FLASHMLA": use FlashMLA + # - "FLASH_ATTN_MLA": use FlashAttention for MLA "VLLM_ATTENTION_BACKEND": lambda: os.getenv("VLLM_ATTENTION_BACKEND", None), diff --git a/vllm/platforms/cuda.py 
b/vllm/platforms/cuda.py index c65c987c0e488..1b0a298352cbf 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -223,9 +223,30 @@ class CudaPlatformBase(Platform): if use_mla: # TODO(lucas): refactor to be more concise # we should probably consider factoring out V1 here - if selected_backend == _Backend.CUTLASS_MLA or ( - cls.is_device_capability(100) and selected_backend is None - and block_size == 128): + + from vllm.attention.ops.flashmla import is_flashmla_supported + from vllm.attention.utils.fa_utils import flash_attn_supports_mla + + use_cutlassmla = selected_backend == _Backend.CUTLASS_MLA or ( + selected_backend is None and cls.is_device_capability(100) + and block_size == 128) + use_flashmla = selected_backend in [ + _Backend.FLASHMLA, _Backend.FLASHMLA_VLLM_V1 + ] or (selected_backend is None and is_flashmla_supported()[0]) + use_flashattn = selected_backend == _Backend.FLASH_ATTN_MLA or ( + selected_backend is None and flash_attn_supports_mla()) + use_triton = selected_backend == _Backend.TRITON_MLA or ( + selected_backend is None) + + def _get_version(name, import_suffix) -> str: + if use_v1: + logger.info_once(f"Using {name} backend on V1 engine.") + return f"vllm.v1.attention.backends.mla.{import_suffix}" + else: + logger.info_once(f"Using {name} backend.") + return f"vllm.attention.backends.{import_suffix}" + + if use_cutlassmla: if use_v1: logger.info_once("Using Cutlass MLA backend on V1 engine.") return ("vllm.v1.attention.backends.mla." @@ -233,36 +254,27 @@ class CudaPlatformBase(Platform): else: logger.warning( "Cutlass MLA backend is only supported on V1 engine") - if selected_backend == _Backend.TRITON_MLA or block_size != 64: - if use_v1: - logger.info_once("Using Triton MLA backend on V1 engine.") - return ("vllm.v1.attention.backends.mla." 
- "triton_mla.TritonMLABackend") - else: - logger.info("Using Triton MLA backend.") - return "vllm.attention.backends.triton_mla.TritonMLABackend" - else: - from vllm.attention.backends.flashmla import ( - is_flashmla_supported) - if not is_flashmla_supported()[0]: - logger.warning( - "FlashMLA backend is not supported due to %s", - is_flashmla_supported()[1]) - elif block_size != 64: + if use_flashmla: + if block_size != 64: logger.warning( "FlashMLA backend is not supported for block size %d" " (currently only supports block size 64).", block_size) else: - if use_v1: - logger.info_once( - "Using FlashMLA backend on V1 engine.") - return ("vllm.v1.attention.backends.mla." - "flashmla.FlashMLABackend") - else: - logger.info("Using FlashMLA backend.") - return ("vllm.attention.backends." - "flashmla.FlashMLABackend") + return _get_version("FlashMLA", "flashmla.FlashMLABackend") + if use_flashattn: + if use_v1: + logger.info_once( + "Using FlashAttention MLA backend on V1 engine.") + return ("vllm.v1.attention.backends.mla." 
+ "flashattn_mla.FlashAttnMLABackend") + else: + logger.warning( + "FlashAttention MLA backend is only supported on V1 " + "engine.") + if use_triton: + return _get_version("Triton MLA", + "triton_mla.TritonMLABackend") if use_v1: FLASHINFER_V1 = "vllm.v1.attention.backends.flashinfer.FlashInferBackend" # noqa: E501 FLEX_ATTENTION_V1 = "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend" # noqa: E501 diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index ad12f7f788cf8..cb620542b89f3 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -52,9 +52,10 @@ class _Backend(enum.Enum): FLASHINFER_VLLM_V1 = enum.auto() TRITON_MLA = enum.auto() # Supported by V1 TRITON_MLA_VLLM_V1 = enum.auto() - FLASHMLA_VLLM_V1 = enum.auto() - FLASHMLA = enum.auto() # Supported by V1 CUTLASS_MLA = enum.auto() + FLASHMLA = enum.auto() # Supported by V1 + FLASHMLA_VLLM_V1 = enum.auto() + FLASH_ATTN_MLA = enum.auto() # Supported by V1 PALLAS = enum.auto() PALLAS_VLLM_V1 = enum.auto() IPEX = enum.auto() diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 2f275b8b23b17..fc1738579787a 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -317,7 +317,8 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): num_reqs = common_attn_metadata.num_reqs num_actual_tokens = common_attn_metadata.num_actual_tokens num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens =\ - split_decodes_and_prefills(common_attn_metadata) + split_decodes_and_prefills(common_attn_metadata, + decode_threshold=self.reorder_batch_threshold) page_size = self.page_size max_q_len = common_attn_metadata.max_query_len diff --git a/vllm/v1/attention/backends/linear_attn.py b/vllm/v1/attention/backends/linear_attn.py index f08b6d7f177c7..ac0034b5dcf06 100644 --- a/vllm/v1/attention/backends/linear_attn.py +++ 
b/vllm/v1/attention/backends/linear_attn.py @@ -52,8 +52,9 @@ class LinearAttentionMetadataBuilder( state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0] num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( - split_decodes_and_prefills(common_attn_metadata, - decode_threshold=1)) + split_decodes_and_prefills( + common_attn_metadata, + decode_threshold=self.reorder_batch_threshold)) attn_metadata = LinearAttentionMetadata( num_prefills=num_prefills, diff --git a/vllm/v1/attention/backends/mamba1_attn.py b/vllm/v1/attention/backends/mamba1_attn.py index 97a1aa86dda0d..7cbfa2c2c9a54 100644 --- a/vllm/v1/attention/backends/mamba1_attn.py +++ b/vllm/v1/attention/backends/mamba1_attn.py @@ -50,8 +50,9 @@ class Mamba1AttentionMetadataBuilder( query_start_loc.device) num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( - split_decodes_and_prefills(common_attn_metadata, - decode_threshold=1)) + split_decodes_and_prefills( + common_attn_metadata, + decode_threshold=self.reorder_batch_threshold)) has_initial_states = None padded_decodes = num_decodes diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py index ed30884fdbc94..f3e6cd7430e0b 100644 --- a/vllm/v1/attention/backends/mamba2_attn.py +++ b/vllm/v1/attention/backends/mamba2_attn.py @@ -115,8 +115,9 @@ class Mamba2AttentionMetadataBuilder( state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0] num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( - split_decodes_and_prefills(common_attn_metadata, - decode_threshold=1)) + split_decodes_and_prefills( + common_attn_metadata, + decode_threshold=self.reorder_batch_threshold)) # Compute seq_idx, chunk_indices and chunk_offsets for prefill only if num_prefills > 0: diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 9f93b50b075b4..b4c9aae254ea0 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ 
b/vllm/v1/attention/backends/mla/common.py @@ -578,11 +578,13 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): prefill.prefill_main = self._fi_prefill_main prefill.prefill_chunks = self._fi_prefill_chunks - def _build_decode(self, block_table_tensor: torch.Tensor, - seq_lens: torch.Tensor): + def _build_decode( + self, block_table_tensor: torch.Tensor, seq_lens_cpu: torch.Tensor, + seq_lens_device: torch.Tensor, query_start_loc_cpu: torch.Tensor, + query_start_loc_device: torch.Tensor) -> MLACommonDecodeMetadata: return MLACommonDecodeMetadata( block_table=block_table_tensor, - seq_lens=seq_lens, + seq_lens=seq_lens_device, ) def build_for_cudagraph_capture( @@ -618,6 +620,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): query_start_loc = common_attn_metadata.query_start_loc query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu seq_lens = common_attn_metadata.seq_lens + seq_lens_cpu = common_attn_metadata.seq_lens_cpu query_seq_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1] @@ -625,7 +628,8 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): query_seq_lens_cpu) num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = \ - split_decodes_and_prefills(common_attn_metadata) + split_decodes_and_prefills(common_attn_metadata, + decode_threshold=self.reorder_batch_threshold) assert num_decodes + num_prefills == num_reqs assert num_decode_tokens + num_prefill_tokens == num_tokens @@ -725,7 +729,10 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): if num_decodes > 0: decode_metadata = self._build_decode( block_table_tensor=block_table_tensor[:num_decodes, ...], - seq_lens=seq_lens[:num_decodes], + seq_lens_cpu=seq_lens_cpu[:num_decodes], + seq_lens_device=seq_lens[:num_decodes], + query_start_loc_cpu=query_start_loc_cpu[:num_decodes + 1], + query_start_loc_device=query_start_loc[:num_decodes + 1], ) attn_metadata = self.metadata_cls( diff --git 
a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py new file mode 100644 index 0000000000000..0e08307ddf841 --- /dev/null +++ b/vllm/v1/attention/backends/mla/flashattn_mla.py @@ -0,0 +1,189 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from dataclasses import dataclass +from typing import ClassVar, Optional + +import torch + +from vllm.attention.backends.abstract import (AttentionLayer, AttentionType, + is_quantized_kv_cache) +from vllm.attention.utils.fa_utils import (flash_attn_supports_mla, + get_flash_attn_version) +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.v1.attention.backends.mla.common import (MLACommonBackend, + MLACommonDecodeMetadata, + MLACommonImpl, + MLACommonMetadata, + MLACommonMetadataBuilder) +from vllm.v1.kv_cache_interface import AttentionSpec +from vllm.vllm_flash_attn import flash_attn_varlen_func, get_scheduler_metadata + +logger = init_logger(__name__) + + +class FlashAttnMLABackend(MLACommonBackend): + + @staticmethod + def get_name() -> str: + return "FLASH_ATTN_MLA" + + @staticmethod + def get_metadata_cls() -> type["FlashAttnMLAMetadata"]: + return FlashAttnMLAMetadata + + @staticmethod + def get_builder_cls() -> type["FlashAttnMLAMetadataBuilder"]: + return FlashAttnMLAMetadataBuilder + + @staticmethod + def get_impl_cls() -> type["FlashAttnMLAImpl"]: + return FlashAttnMLAImpl + + +@dataclass +class FlashAttnMLADecodeMetadata(MLACommonDecodeMetadata): + query_start_loc: torch.Tensor + max_query_len: int + max_seq_len: int + scheduler_metadata: Optional[torch.Tensor] = None + + +@dataclass +class FlashAttnMLAMetadata(MLACommonMetadata[FlashAttnMLADecodeMetadata]): + pass + + +class FlashAttnMLAMetadataBuilder( + MLACommonMetadataBuilder[FlashAttnMLAMetadata]): + reorder_batch_threshold: ClassVar[int] = 512 + + def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], 
+ vllm_config: VllmConfig, device: torch.device): + super().__init__(kv_cache_spec, layer_names, vllm_config, device, + FlashAttnMLAMetadata) + self.fa_aot_schedule = (get_flash_attn_version() == 3) + + def _schedule_decode(self, num_reqs, cu_query_lens, max_query_len, seqlens, + max_seq_len, causal): + if self.fa_aot_schedule: + return get_scheduler_metadata( + batch_size=num_reqs, + max_seqlen_q=max_query_len, + max_seqlen_k=max_seq_len, + num_heads_q=self.num_heads, + num_heads_kv=1, + headdim=self.mla_dims.qk_rope_head_dim, + cache_seqlens=seqlens, + qkv_dtype=self.kv_cache_spec.dtype, + headdim_v=self.mla_dims.kv_lora_rank, + page_size=self.page_size, + cu_seqlens_q=cu_query_lens, + causal=causal, + ) + return None + + def _build_decode( + self, block_table_tensor: torch.Tensor, seq_lens_cpu: torch.Tensor, + seq_lens_device: torch.Tensor, query_start_loc_cpu: torch.Tensor, + query_start_loc_device: torch.Tensor + ) -> FlashAttnMLADecodeMetadata: + query_lens_cpu = (query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]) + max_query_len = query_lens_cpu.max().item() + max_seq_len = seq_lens_cpu.max().item() + + scheduler_metadata = self._schedule_decode( + num_reqs=seq_lens_cpu.numel(), + cu_query_lens=query_start_loc_device, + max_query_len=max_query_len, + seqlens=seq_lens_device, + max_seq_len=max_seq_len, + causal=True, + ) + + return FlashAttnMLADecodeMetadata( + block_table=block_table_tensor, + seq_lens=seq_lens_device, + query_start_loc=query_start_loc_device, + max_query_len=max_query_len, + max_seq_len=max_seq_len, + scheduler_metadata=scheduler_metadata, + ) + + +class FlashAttnMLAImpl(MLACommonImpl[FlashAttnMLAMetadata]): + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[list[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + logits_soft_cap: Optional[float], + attn_type: str, + kv_sharing_target_layer_name: Optional[str], + # MLA Specific Arguments + **mla_args) 
-> None: + super().__init__(num_heads, head_size, scale, num_kv_heads, + alibi_slopes, sliding_window, kv_cache_dtype, + logits_soft_cap, attn_type, + kv_sharing_target_layer_name, **mla_args) + + assert flash_attn_supports_mla(), \ + "FlashAttnMLA is not supported on this device" + + unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] + if any(unsupported_features): + raise NotImplementedError( + "FlashAttnMLAImpl does not support one of the following: " + "alibi_slopes, sliding_window, logits_soft_cap") + + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "FlashAttnMLAImpl") + + if is_quantized_kv_cache(self.kv_cache_dtype): + raise NotImplementedError( + "FlashAttnMLA V1 with FP8 KV cache not yet supported") + + def _forward_decode( + self, + q_nope: torch.Tensor, + q_pe: torch.Tensor, + kv_c_and_k_pe_cache: torch.Tensor, + attn_metadata: FlashAttnMLAMetadata, + layer: AttentionLayer, + ) -> torch.Tensor: + assert kv_c_and_k_pe_cache.numel() > 0 + assert attn_metadata.decode is not None + + if self.kv_cache_dtype.startswith("fp8"): + raise NotImplementedError( + "FP8 FlashAttention MLA not yet supported") + + kv_c_cache = kv_c_and_k_pe_cache[..., :self.kv_lora_rank] + k_pe_cache = kv_c_and_k_pe_cache[..., self.kv_lora_rank:] + + o = flash_attn_varlen_func( + q=q_pe, + k=k_pe_cache.unsqueeze(-2), # Add head dim of 1 + v=kv_c_cache.unsqueeze(-2), # Add head dim of 1 + q_v=q_nope, + max_seqlen_q=attn_metadata.decode.max_query_len, + cu_seqlens_q=attn_metadata.decode.query_start_loc, + max_seqlen_k=attn_metadata.decode.max_seq_len, + seqused_k=attn_metadata.decode.seq_lens, + block_table=attn_metadata.decode.block_table, + softmax_scale=self.scale, + causal=True, + fa_version=3, # only version 3 is supported + scheduler_metadata=attn_metadata.decode.scheduler_metadata, + ) + + return self._v_up_proj(o) diff --git 
a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index 1c50144d47900..df617ab7a8ea7 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -85,11 +85,13 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): device=self.device, dtype=torch.int32) - def _build_decode(self, block_table_tensor: torch.Tensor, - seq_lens: torch.Tensor) -> FlashMLADecodeMetadata: + def _build_decode( + self, block_table_tensor: torch.Tensor, seq_lens_cpu: torch.Tensor, + seq_lens_device: torch.Tensor, query_start_loc_cpu: torch.Tensor, + query_start_loc_device: torch.Tensor) -> FlashMLADecodeMetadata: tile_scheduler_metadata, num_splits = \ get_mla_metadata( - seq_lens, + seq_lens_device, self.num_q_heads, 1, # MQA for the decode path ) @@ -123,7 +125,7 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): return FlashMLADecodeMetadata( block_table=block_table_tensor, - seq_lens=seq_lens, + seq_lens=seq_lens_device, tile_scheduler_metadata=tile_scheduler_metadata, num_splits=num_splits, ) diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 870cc600388e7..42670093daa9b 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -104,12 +104,14 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): dtype=torch.int32, device=device) - def _build_decode(self, block_table_tensor: torch.Tensor, - seq_lens: torch.Tensor) -> AiterMLADecodeMetadata: + def _build_decode( + self, block_table_tensor: torch.Tensor, seq_lens_cpu: torch.Tensor, + seq_lens_device: torch.Tensor, query_start_loc_cpu: torch.Tensor, + query_start_loc_device: torch.Tensor) -> AiterMLADecodeMetadata: page_size = self.kv_cache_spec.block_size - block_table_bounds = (seq_lens + page_size - 1) // page_size + block_table_bounds = 
(seq_lens_device + page_size - 1) // page_size device = self.device - num_reqs = seq_lens.size(0) + num_reqs = seq_lens_device.size(0) mask = (torch.arange(block_table_tensor.size(1), dtype=block_table_tensor.dtype, @@ -117,7 +119,7 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): < block_table_bounds.unsqueeze(1)) paged_kv_indices = block_table_tensor[mask] - paged_kv_last_page_len = seq_lens % page_size + paged_kv_last_page_len = seq_lens_device % page_size paged_kv_last_page_len = torch.where(paged_kv_last_page_len == 0, page_size, paged_kv_last_page_len) @@ -156,7 +158,7 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): attn_metadata = AiterMLADecodeMetadata( block_table=block_table_tensor, - seq_lens=seq_lens, + seq_lens=seq_lens_device, paged_kv_indptr=paged_kv_indptr, paged_kv_indices=paged_kv_indices, paged_kv_last_page_len=paged_kv_last_page_len, diff --git a/vllm/v1/attention/backends/short_conv_attn.py b/vllm/v1/attention/backends/short_conv_attn.py index d80ced8ec876a..fcbf0c7b53560 100644 --- a/vllm/v1/attention/backends/short_conv_attn.py +++ b/vllm/v1/attention/backends/short_conv_attn.py @@ -58,8 +58,9 @@ class ShortConvAttentionMetadataBuilder( state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0] num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( - split_decodes_and_prefills(common_attn_metadata, - decode_threshold=1)) + split_decodes_and_prefills( + common_attn_metadata, + decode_threshold=self.reorder_batch_threshold)) has_initial_states = None if num_prefills > 0: #[batch,] @@ -78,4 +79,4 @@ class ShortConvAttentionMetadataBuilder( has_initial_states=has_initial_states, state_indices_tensor=state_indices_tensor, ) - return attn_metadata \ No newline at end of file + return attn_metadata diff --git a/vllm/v1/attention/backends/xformers.py b/vllm/v1/attention/backends/xformers.py index 7f888c1135743..c59ff32cf7c28 100644 --- 
a/vllm/v1/attention/backends/xformers.py +++ b/vllm/v1/attention/backends/xformers.py @@ -3,7 +3,7 @@ """Attention layer with XFormersAttention.""" from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, ClassVar, Optional import torch @@ -197,6 +197,8 @@ class XFormersAttentionMetadata: class XFormersAttentionMetadataBuilder( AttentionMetadataBuilder[XFormersAttentionMetadata]): + reorder_batch_threshold: ClassVar[int] = 1 + def __init__( self, kv_cache_spec: AttentionSpec, @@ -212,9 +214,10 @@ class XFormersAttentionMetadataBuilder( def reorder_batch(self, input_batch: "InputBatch", scheduler_output: "SchedulerOutput") -> bool: - return reorder_batch_to_split_decodes_and_prefills(input_batch, - scheduler_output, - decode_threshold=1) + return reorder_batch_to_split_decodes_and_prefills( + input_batch, + scheduler_output, + decode_threshold=self.reorder_batch_threshold) def build( self, @@ -223,8 +226,9 @@ class XFormersAttentionMetadataBuilder( fast_build: bool = False, ) -> XFormersAttentionMetadata: num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( - split_decodes_and_prefills(common_attn_metadata, - decode_threshold=1)) + split_decodes_and_prefills( + common_attn_metadata, + decode_threshold=self.reorder_batch_threshold)) num_actual_tokens = common_attn_metadata.num_actual_tokens q_start_loc = common_attn_metadata.query_start_loc From 369a079568f8e8ced4d9dac4da2fbfdd451e778e Mon Sep 17 00:00:00 2001 From: Ignacio Sica Date: Thu, 4 Sep 2025 06:48:25 -0300 Subject: [PATCH 79/95] [Hardware][Apple-CPU] Disable OneDNN build for Apple Silicon (#24200) Signed-off-by: ignaciosica Co-authored-by: Li, Jiang --- cmake/cpu_extension.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 52bfd82c7fcfe..06494463223bd 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -88,6 +88,7 @@ 
is_avx512_disabled(AVX512_DISABLED) if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") message(STATUS "Apple Silicon Detected") + set(APPLE_SILICON_FOUND TRUE) set(ENABLE_NUMA OFF) check_sysctl(hw.optional.neon ASIMD_FOUND) check_sysctl(hw.optional.arm.FEAT_BF16 ARM_BF16_FOUND) @@ -189,7 +190,7 @@ else() set(USE_ACL OFF) endif() -if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) +if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) FetchContent_Declare( oneDNN GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git From 8f423e5f43eca54c9b0e5583ab388fd6d5cc8672 Mon Sep 17 00:00:00 2001 From: Kebe Date: Thu, 4 Sep 2025 18:49:06 +0900 Subject: [PATCH 80/95] [Feature][Response API] Add streaming support for non-harmony (#23741) Signed-off-by: Kebe --- .../openai/responses/test_basic.py | 16 + vllm/entrypoints/context.py | 10 + vllm/entrypoints/openai/serving_responses.py | 456 +++++++++++++++--- 3 files changed, 406 insertions(+), 76 deletions(-) diff --git a/tests/v1/entrypoints/openai/responses/test_basic.py b/tests/v1/entrypoints/openai/responses/test_basic.py index 7a0baa5767cba..2ee1004493a16 100644 --- a/tests/v1/entrypoints/openai/responses/test_basic.py +++ b/tests/v1/entrypoints/openai/responses/test_basic.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import openai # use the official client for correctness check +import openai.types.responses as openai_responses_types import pytest @@ -86,3 +87,18 @@ async def test_logprobs(client: openai.AsyncOpenAI): outputs = response.output assert outputs[-1].content[-1].logprobs assert len(outputs[-1].content[-1].logprobs[0].top_logprobs) == 5 + + +@pytest.mark.asyncio +async def test_streaming(client: openai.AsyncOpenAI): + stream = await client.responses.create( + input="What is 13 * 24?", + stream=True, + ) + events = [event 
async for event in stream] + assert isinstance(events[0], openai_responses_types.ResponseCreatedEvent) + assert any( + isinstance(event, openai_responses_types.ResponseTextDeltaEvent) + for event in events) + assert isinstance(events[-1], + openai_responses_types.ResponseCompletedEvent) diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index 52e35bcac9619..fb58cba3a40ff 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -49,9 +49,19 @@ class SimpleContext(ConversationContext): def __init__(self): self.last_output = None + self.num_prompt_tokens = 0 + self.num_output_tokens = 0 + self.num_cached_tokens = 0 + # todo num_reasoning_tokens is not implemented yet. + self.num_reasoning_tokens = 0 def append_output(self, output) -> None: self.last_output = output + if not isinstance(output, RequestOutput): + raise ValueError("SimpleContext only supports RequestOutput.") + self.num_prompt_tokens = len(output.prompt_token_ids or []) + self.num_cached_tokens = output.num_cached_tokens or 0 + self.num_output_tokens += len(output.outputs[0].token_ids or []) def need_builtin_tool_call(self) -> bool: return False diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 58424c9d9f7be..d49724b0439cf 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -4,6 +4,7 @@ import asyncio import json import time +import uuid from collections import deque from collections.abc import AsyncGenerator, AsyncIterator, Sequence from contextlib import AsyncExitStack @@ -25,7 +26,8 @@ from openai.types.responses import (ResponseCreatedEvent, ResponseOutputMessage, ResponseOutputText, ResponseReasoningItem, ResponseReasoningTextDeltaEvent, - ResponseReasoningTextDoneEvent) + ResponseReasoningTextDoneEvent, + response_text_delta_event) from openai.types.responses.response_output_text import (Logprob, LogprobTopLogprob) # yapf: enable @@ -47,7 
+49,7 @@ from vllm.entrypoints.harmony_utils import ( from vllm.entrypoints.logger import RequestLogger # yapf conflicts with isort for this block # yapf: disable -from vllm.entrypoints.openai.protocol import (ErrorResponse, +from vllm.entrypoints.openai.protocol import (DeltaMessage, ErrorResponse, InputTokensDetails, OutputTokensDetails, RequestResponseMetadata, @@ -459,10 +461,6 @@ class OpenAIServingResponses(OpenAIServing): assert isinstance(context, HarmonyContext) output = self._make_response_output_items_with_harmony(context) # TODO: these are all 0 for now! - num_prompt_tokens = context.num_prompt_tokens - num_generated_tokens = context.num_output_tokens - num_cached_tokens = context.num_cached_tokens - num_reasoning_tokens = context.num_reasoning_tokens else: assert isinstance(context, SimpleContext) final_res = context.last_output @@ -475,10 +473,11 @@ class OpenAIServingResponses(OpenAIServing): # Calculate usage. assert final_res.prompt_token_ids is not None - num_prompt_tokens = len(final_res.prompt_token_ids) - num_generated_tokens = len(final_output.token_ids) - num_cached_tokens = final_res.num_cached_tokens - num_reasoning_tokens = 0 + assert isinstance(context, (SimpleContext, HarmonyContext)) + num_prompt_tokens = context.num_prompt_tokens + num_generated_tokens = context.num_output_tokens + num_cached_tokens = context.num_cached_tokens + num_reasoning_tokens = context.num_reasoning_tokens usage = ResponseUsage( input_tokens=num_prompt_tokens, @@ -553,6 +552,28 @@ class OpenAIServingResponses(OpenAIServing): )) return out + def _create_stream_response_logprobs( + self, + token_ids: Sequence[int], + logprobs: Optional[SampleLogprobs], + tokenizer: AnyTokenizer, + top_logprobs: Optional[int] = None + ) -> list[response_text_delta_event.Logprob]: + lgs = self._create_response_logprobs(token_ids=token_ids, + logprobs=logprobs, + tokenizer=tokenizer, + top_logprobs=top_logprobs) + return [ + response_text_delta_event.Logprob( + token=lg.token, + 
logprob=lg.logprob, + top_logprobs=[ + response_text_delta_event.LogprobTopLogprob( + token=tl.token, logprob=tl.logprob) + for tl in lg.top_logprobs + ]) for lg in lgs + ] + def _make_response_output_items( self, request: ResponsesRequest, @@ -912,7 +933,7 @@ class OpenAIServingResponses(OpenAIServing): status_code=HTTPStatus.BAD_REQUEST, ) - async def _process_streaming_events( + async def _process_simple_streaming_events( self, request: ResponsesRequest, sampling_params: SamplingParams, @@ -922,47 +943,292 @@ class OpenAIServingResponses(OpenAIServing): tokenizer: AnyTokenizer, request_metadata: RequestResponseMetadata, created_time: int, + _send_event: Callable[[BaseModel], str], ) -> AsyncGenerator[str, None]: - sequence_number = 0 + current_content_index = 0 + current_output_index = 0 + current_item_id = "" + reasoning_parser = None + if self.reasoning_parser: + reasoning_parser = self.reasoning_parser(tokenizer) + previous_text = "" + previous_token_ids: list[int] = [] + first_delta_sent = False + previous_delta_messages: list[DeltaMessage] = [] + async for ctx in result_generator: + assert isinstance(ctx, SimpleContext) + if ctx.last_output is None: + continue + if ctx.last_output.outputs: + output = ctx.last_output.outputs[0] + if reasoning_parser: + delta_message = \ + reasoning_parser.extract_reasoning_content_streaming( + previous_text=previous_text, + current_text=previous_text + output.text, + delta_text=output.text, + previous_token_ids=previous_token_ids, + current_token_ids=previous_token_ids + + output.token_ids, + delta_token_ids=output.token_ids, + ) + else: + delta_message = DeltaMessage(content=output.text, ) + previous_text += output.text + previous_token_ids += output.token_ids + if not delta_message: + continue + if not first_delta_sent: + current_item_id = str(uuid.uuid4()) + if delta_message.reasoning_content: + yield _send_event( + openai_responses_types. 
+ ResponseOutputItemAddedEvent( + type="response.output_item.added", + sequence_number=-1, + output_index=current_output_index, + item=openai_responses_types. + ResponseReasoningItem( + type="reasoning", + id=current_item_id, + summary=[], + status="in_progress", + ), + )) + else: + yield _send_event( + openai_responses_types. + ResponseOutputItemAddedEvent( + type="response.output_item.added", + sequence_number=-1, + output_index=current_output_index, + item=openai_responses_types. + ResponseOutputMessage( + id=current_item_id, + type="message", + role="assistant", + content=[], + status="in_progress", + ), + )) + yield _send_event( + openai_responses_types.ResponseContentPartAddedEvent( + type="response.content_part.added", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + content_index=current_content_index, + part=openai_responses_types.ResponseOutputText( + type="output_text", + text="", + annotations=[], + logprobs=[], + ), + )) + current_content_index += 1 + first_delta_sent = True + # todo(kebe7jun) tool call support - def _send_event(event: BaseModel): - nonlocal sequence_number - # Set sequence_number if the event has this attribute - if hasattr(event, 'sequence_number'): - event.sequence_number = sequence_number - sequence_number += 1 - # Get event type from the event's type field if it exists - event_type = getattr(event, 'type', 'unknown') - return (f"event: {event_type}\n" - f"data: {event.model_dump_json(indent=None)}\n\n") + # check delta message and previous delta message are + # same as content or reasoning content + if (previous_delta_messages + and previous_delta_messages[-1].reasoning_content + is not None and delta_message.content is not None): + # from reasoning to normal content, send done + # event for reasoning + reason_content = ''.join( + pm.reasoning_content for pm in previous_delta_messages + if pm.reasoning_content is not None) + yield _send_event( + ResponseReasoningTextDoneEvent( + 
type="response.reasoning_text.done", + item_id=current_item_id, + sequence_number=-1, + output_index=current_output_index, + content_index=current_content_index, + text=reason_content, + )) + current_content_index = 0 + reasoning_item = ResponseReasoningItem( + type="reasoning", + content=[ + ResponseReasoningTextContent( + text=reason_content, + type="reasoning_text", + ), + ], + status="completed", + id=current_item_id, + summary=[], + ) + yield _send_event( + ResponseOutputItemDoneEvent( + type="response.output_item.done", + sequence_number=-1, + output_index=current_output_index, + item=reasoning_item, + )) + yield _send_event( + openai_responses_types.ResponseOutputItemAddedEvent( + type="response.output_item.added", + sequence_number=-1, + output_index=current_output_index, + item=openai_responses_types.ResponseOutputMessage( + id=current_item_id, + type="message", + role="assistant", + content=[], + status="in_progress", + ), + )) + current_output_index += 1 + current_item_id = str(uuid.uuid4()) + yield _send_event( + openai_responses_types.ResponseContentPartAddedEvent( + type="response.content_part.added", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + content_index=current_content_index, + part=openai_responses_types.ResponseOutputText( + type="output_text", + text="", + annotations=[], + logprobs=[], + ), + )) + current_content_index += 1 + # reset previous delta messages + previous_delta_messages = [] + if delta_message.reasoning_content is not None: + yield _send_event( + ResponseReasoningTextDeltaEvent( + type="response.reasoning_text.delta", + sequence_number=-1, + content_index=current_content_index, + output_index=current_output_index, + item_id=current_item_id, + delta=delta_message.reasoning_content, + )) + elif delta_message.content is not None: + yield _send_event( + openai_responses_types.ResponseTextDeltaEvent( + type="response.output_text.delta", + sequence_number=-1, + 
content_index=current_content_index, + output_index=current_output_index, + item_id=current_item_id, + delta=delta_message.content, + logprobs=self._create_stream_response_logprobs( + token_ids=output.token_ids, + logprobs=output.logprobs, + tokenizer=tokenizer, + top_logprobs=request.top_logprobs, + ) if request.is_include_output_logprobs() else [], + )) + current_content_index += 1 + + previous_delta_messages.append(delta_message) + if previous_delta_messages: + if previous_delta_messages[-1].reasoning_content is not None: + reason_content = ''.join(pm.reasoning_content + for pm in previous_delta_messages + if pm.reasoning_content is not None) + yield _send_event( + ResponseReasoningTextDoneEvent( + type="response.reasoning_text.done", + item_id=current_item_id, + sequence_number=-1, + output_index=current_output_index, + content_index=current_content_index, + text=reason_content, + )) + current_content_index += 1 + reasoning_item = ResponseReasoningItem( + type="reasoning", + content=[ + ResponseReasoningTextContent( + text=reason_content, + type="reasoning_text", + ), + ], + status="completed", + id=current_item_id, + summary=[], + ) + yield _send_event( + ResponseOutputItemDoneEvent( + type="response.output_item.done", + sequence_number=-1, + output_index=current_output_index, + item=reasoning_item, + )) + elif previous_delta_messages[-1].content is not None: + final_content = ''.join(pm.content + for pm in previous_delta_messages + if pm.content is not None) + yield _send_event( + openai_responses_types.ResponseTextDoneEvent( + type="response.output_text.done", + sequence_number=-1, + output_index=current_output_index, + content_index=current_content_index, + text=final_content, + logprobs=[], + item_id=current_item_id, + )) + current_content_index += 1 + part = ResponseOutputText( + text=final_content, + type="output_text", + annotations=[], + ) + yield _send_event( + openai_responses_types.ResponseContentPartDoneEvent( + type="response.content_part.done", + 
sequence_number=-1, + item_id=current_item_id, + output_index=current_output_index, + content_index=current_content_index, + part=part, + )) + current_content_index += 1 + item = ResponseOutputMessage( + type="message", + role="assistant", + content=[ + part, + ], + status="completed", + id=current_item_id, + summary=[], + ) + yield _send_event( + ResponseOutputItemDoneEvent( + type="response.output_item.done", + sequence_number=-1, + output_index=current_output_index, + item=item, + )) + + async def _process_harmony_streaming_events( + self, + request: ResponsesRequest, + sampling_params: SamplingParams, + result_generator: AsyncIterator[Optional[ConversationContext]], + context: ConversationContext, + model_name: str, + tokenizer: AnyTokenizer, + request_metadata: RequestResponseMetadata, + created_time: int, + _send_event: Callable[[BaseModel], str], + ) -> AsyncGenerator[str, None]: current_content_index = 0 # FIXME: this number is never changed current_output_index = 0 current_item_id = "" # FIXME: this number is never changed sent_output_item_added = False - initial_response = ResponsesResponse.from_request( - request, - sampling_params, - model_name=model_name, - created_time=created_time, - output=[], - status="in_progress", - usage=None, - ).model_dump() - yield _send_event( - ResponseCreatedEvent( - type="response.created", - sequence_number=-1, - response=initial_response, - )) - yield _send_event( - ResponseInProgressEvent( - type="response.in_progress", - sequence_number=-1, - response=initial_response, - )) - async for ctx in result_generator: assert isinstance(ctx, StreamingHarmonyContext) @@ -1312,29 +1578,6 @@ class OpenAIServingResponses(OpenAIServing): ), )) - async def empty_async_generator(): - # A hack to trick Python to think this is a generator but in fact - # it immediately returns. 
- if False: - yield - - final_response = await self.responses_full_generator( - request, - sampling_params, - empty_async_generator(), - context, - model_name, - tokenizer, - request_metadata, - created_time=created_time, - ) - yield _send_event( - openai_responses_types.ResponseCompletedEvent( - type="response.completed", - sequence_number=-1, - response=final_response.model_dump(), - )) - async def responses_stream_generator( self, request: ResponsesRequest, @@ -1349,16 +1592,77 @@ class OpenAIServingResponses(OpenAIServing): # TODO: # 1. Handle disconnect - if not isinstance(context, StreamingHarmonyContext): - raise NotImplementedError( - "Streaming is not supported for responses API without Harmony." - ) - created_time = created_time or int(time.time()) + sequence_number = 0 + + def _send_event(event: BaseModel): + nonlocal sequence_number + # Set sequence_number if the event has this attribute + if hasattr(event, 'sequence_number'): + event.sequence_number = sequence_number + sequence_number += 1 + # Get event type from the event's type field if it exists + event_type = getattr(event, 'type', 'unknown') + return (f"event: {event_type}\n" + f"data: {event.model_dump_json(indent=None)}\n\n") + async with AsyncExitStack() as exit_stack: - await context.init_tool_sessions(self.tool_server, exit_stack) - async for event_data in self._process_streaming_events( - request, sampling_params, result_generator, context, - model_name, tokenizer, request_metadata, created_time): + processer = None + if self.use_harmony: + await context.init_tool_sessions(self.tool_server, exit_stack) + processer = self._process_harmony_streaming_events + else: + processer = self._process_simple_streaming_events + + initial_response = ResponsesResponse.from_request( + request, + sampling_params, + model_name=model_name, + created_time=created_time, + output=[], + status="in_progress", + usage=None, + ).model_dump() + yield _send_event( + ResponseCreatedEvent( + type="response.created", + 
sequence_number=-1, + response=initial_response, + )) + yield _send_event( + ResponseInProgressEvent( + type="response.in_progress", + sequence_number=-1, + response=initial_response, + )) + + async for event_data in processer(request, sampling_params, + result_generator, context, + model_name, tokenizer, + request_metadata, created_time, + _send_event): yield event_data + + async def empty_async_generator(): + # A hack to trick Python to think this is a generator but + # in fact it immediately returns. + if False: + yield + + final_response = await self.responses_full_generator( + request, + sampling_params, + empty_async_generator(), + context, + model_name, + tokenizer, + request_metadata, + created_time=created_time, + ) + yield _send_event( + openai_responses_types.ResponseCompletedEvent( + type="response.completed", + sequence_number=-1, + response=final_response.model_dump(), + )) From 6c7af8110a2c383f18a9523bc26c26d3f599ff2e Mon Sep 17 00:00:00 2001 From: TJian Date: Thu, 4 Sep 2025 02:58:18 -0700 Subject: [PATCH 81/95] [Doc] Update vLLM Singapore Meetup info (#24234) Signed-off-by: tjtanaa --- README.md | 1 + docs/community/meetups.md | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index 8812aac4ea266..e13993efd3702 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Easy, fast, and cheap LLM serving for everyone *Latest News* 🔥 +- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing). - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH). 
- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view). - [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152). diff --git a/docs/community/meetups.md b/docs/community/meetups.md index 221a7bd96213f..04919769e173f 100644 --- a/docs/community/meetups.md +++ b/docs/community/meetups.md @@ -2,6 +2,7 @@ We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: +- [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet), August 27th 2025. [[Slides]](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing) - [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg), August 23rd 2025. [[Slides]](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH) - [vLLM Korea Meetup](https://luma.com/cgcgprmh), August 19th 2025. [[Slides]](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view). - [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA), August 2nd 2025. [[Slides]](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) [[Recording]](https://www.chaspark.com/#/live/1166916873711665152). 
From eafa8dcde63d625350ed618db4dd1cbcbaae77a1 Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu Date: Thu, 4 Sep 2025 18:58:26 +0800 Subject: [PATCH 82/95] [Model] Add pp support for hunyuan (#24212) Signed-off-by: zjy0516 --- docs/models/supported_models.md | 4 ++-- vllm/model_executor/models/hunyuan_v1.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index c8f628d31abf5..c25c4b52197eb 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -365,8 +365,8 @@ th { | `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | ✅︎ | | `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | ✅︎ | | `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ | -| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ | -| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ | +| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | ✅︎ | ✅︎ | +| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | ✅︎ | ✅︎ | | `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | ✅︎ | | `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. 
| ✅︎ | ✅︎ | ✅︎ | diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index fbba849a76f23..a74a44bc2b511 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -56,7 +56,7 @@ from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA +from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, make_layers) @@ -841,7 +841,7 @@ class HunYuanModel(nn.Module): return loaded_params -class HunYuanV1Base(nn.Module, SupportsLoRA): +class HunYuanV1Base(nn.Module, SupportsLoRA, SupportsPP): packed_modules_mapping = { "qkv_proj": [ "q_proj", From 2b30afa4420cbada6dd9084de3ee7eb19142b7ff Mon Sep 17 00:00:00 2001 From: nopperl <54780682+nopperl@users.noreply.github.com> Date: Thu, 4 Sep 2025 20:59:16 +0900 Subject: [PATCH 83/95] Use hidden_size_per_head as head_size fallback (#24221) Signed-off-by: nopperl <54780682+nopperl@users.noreply.github.com> --- vllm/config/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 7c2b497022658..941aff8919a92 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -1426,6 +1426,11 @@ class ModelConfig: if getattr(self.hf_text_config, "head_dim", None) is not None: return self.hf_text_config.head_dim + # NOTE: Some models (such as PLaMo2.1) use `hidden_size_per_head` + if getattr(self.hf_text_config, "hidden_size_per_head", + None) is not None: + return self.hf_text_config.hidden_size_per_head + # FIXME(woosuk): This may not be true for all models. 
return (self.hf_text_config.hidden_size // self.hf_text_config.num_attention_heads) From 16ded21eeb578cdd7060a32d21fe213d8f31cd0c Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Thu, 4 Sep 2025 20:41:08 +0800 Subject: [PATCH 84/95] [XPU] support Triton Attention backend on Intel GPU (#24149) Signed-off-by: Kunshang Ji --- .../scripts/hardware_ci/run-xpu-test.sh | 9 +++--- vllm/_ipex_ops.py | 5 ++-- vllm/attention/ops/paged_attn.py | 7 ++++- vllm/platforms/xpu.py | 28 +++++++++++++++++-- vllm/v1/attention/backends/triton_attn.py | 15 ++++++---- 5 files changed, 49 insertions(+), 15 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index 73f3e63fbf5f6..efcd10acf0b93 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -30,10 +30,11 @@ docker run \ bash -c ' set -e echo $ZE_AFFINITY_MASK - VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager - VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE - VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray - VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray + python3 examples/offline_inference/basic/generate.py --model 
facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp + VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager cd tests pytest -v -s v1/core pytest -v -s v1/engine diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 19f6c4e3060ce..c2868c040aa16 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -242,10 +242,9 @@ class ipex_ops: k_scale_float: float = 1.0, v_scale_float: float = 1.0, ) -> None: - assert kv_cache_dtype == "auto" - # TODO: support FP8 kv cache. ipex.llm.modules.PagedAttention.reshape_and_cache_flash( - key, value, key_cache, value_cache, slot_mapping) + key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype, + k_scale_float, v_scale_float) @staticmethod def flash_attn_varlen_func( diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index c6d1501e27578..4d870a45e5800 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -6,9 +6,14 @@ from typing import List, Optional, Tuple import torch -from vllm import _custom_ops as ops +from vllm.platforms import current_platform from vllm.triton_utils import HAS_TRITON +if current_platform.is_cuda_alike(): + from vllm import _custom_ops as ops +elif current_platform.is_xpu(): + from vllm._ipex_ops import ipex_ops as ops + if HAS_TRITON: from vllm.attention.ops.prefix_prefill import context_attention_fwd diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 645a9e63a4e5a..9f89334e9a8a8 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -37,14 +37,38 @@ class XPUPlatform(Platform): dtype: torch.dtype, kv_cache_dtype: Optional[str], block_size: int, use_v1: bool, use_mla: bool, has_sink: bool) -> str: - if selected_backend is not None and selected_backend != _Backend.IPEX: - logger.info("Cannot use %s backend on XPU.", selected_backend) use_v1 = envs.VLLM_USE_V1 if not 
use_v1: raise ValueError("XPU backend only supports V1.") + TRITON_ATTN_VLLM_V1 = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend" # noqa: E501 + FLASH_ATTN_V1 = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" # noqa: E501 + if selected_backend == _Backend.TRITON_ATTN_VLLM_V1: + logger.info_once("Using Triton backend on V1 engine.") + return TRITON_ATTN_VLLM_V1 + elif selected_backend == _Backend.FLASH_ATTN: + logger.info_once("Using Flash Attention backend on V1 engine.") + return FLASH_ATTN_V1 + elif selected_backend: + raise ValueError( + f"Invalid attention backend for {cls.device_name}, " + f"with use_v1: {use_v1} use_mla: {use_mla}") + logger.info("Using Flash Attention backend on V1 engine.") return "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" + @classmethod + def is_kv_cache_dtype_supported(cls, kv_cache_dtype: str, + model_config: "ModelConfig") -> bool: + """ + Check if the kv_cache_dtype is supported. + XPU only support fp8 kv cache with triton backend. 
+ """ + if envs.is_set("VLLM_ATTENTION_BACKEND") and \ + envs.VLLM_ATTENTION_BACKEND == "TRITON_ATTN_VLLM_V1": + return kv_cache_dtype in ["fp8_e4m3", "fp8_e5m2", "fp8"] + + return False + @classmethod def set_device(cls, device: torch.device) -> None: """ diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index a37a7f6811ef9..104cebb45d740 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -7,7 +7,6 @@ from typing import ClassVar, Optional import torch -from vllm import _custom_ops as ops from vllm import envs from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) @@ -23,6 +22,11 @@ from vllm.v1.attention.backends.utils import (AttentionCGSupport, CommonAttentionMetadata) from vllm.v1.kv_cache_interface import AttentionSpec +if current_platform.is_cuda_alike(): + from vllm import _custom_ops as ops +elif current_platform.is_xpu(): + from vllm._ipex_ops import ipex_ops as ops + logger = init_logger(__name__) @@ -337,7 +341,7 @@ class TritonAttentionImpl(AttentionImpl): layer._v_scale, ) else: - torch.ops._C_cache_ops.reshape_and_cache_flash( + ops.reshape_and_cache_flash( key, value, key_cache, @@ -354,9 +358,10 @@ class TritonAttentionImpl(AttentionImpl): num_tokens, num_heads, head_size = query.shape assert layer._q_scale == 1.0, \ "A non 1.0 q_scale is not currently supported." - if not current_platform.is_rocm(): - # Skip Q quantization on ROCm, since dequantizing back to - # f32 in the attention kernel is not supported. + if current_platform.is_cuda(): + # Skip Q quantization on ROCm and XPU, enable this on cuda + # only, since dequantizing back to f32 in the attention kernel + # is not supported. 
query, _ = ops.scaled_fp8_quant( query.reshape( (num_tokens, num_heads * head_size)).contiguous(), From c9f7081f9c848d83ecbf42b57591451d6ff5a7a9 Mon Sep 17 00:00:00 2001 From: Yash Pratap Singh Date: Thu, 4 Sep 2025 18:20:50 +0530 Subject: [PATCH 85/95] [LoRA]: Add lora support to qwen-2.5-omni (#24231) --- docs/models/supported_models.md | 2 +- .../model_executor/models/qwen2_5_omni_thinker.py | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index c25c4b52197eb..9db6f8036a73b 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -662,7 +662,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A+ | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ | | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + IE+ + VE+ | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + IE+ + VE+ | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. 
| ✅︎ | ✅︎ | ✅︎ | -| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen2.5-Omni-7B` | | ✅︎ | ✅︎ | +| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen2.5-Omni-3B`, `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎ | ✅︎ | | `RForConditionalGeneration` | R-VL-4B | T + IE+ | `YannQi/R-4B` | | ✅︎ | ✅︎ | | `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ | | `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ | diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index 5c64c81547e65..29563540a7942 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -41,6 +41,7 @@ from transformers.models.whisper import WhisperFeatureExtractor from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.qwen2_5_vl import ( Qwen2_5_VisionTransformer, Qwen2_5_VLImageEmbeddingInputs, Qwen2_5_VLImageInputs, Qwen2_5_VLImagePixelInputs, @@ -66,7 +67,8 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.transformers_utils.tokenizer import decode_tokens, encode_tokens -from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP +from .interfaces import (MultiModalEmbeddings, SupportsLoRA, + SupportsMultiModal, SupportsPP) from .utils import (AutoWeightsLoader, WeightsMapper, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -705,7 +707,7 @@ class Qwen2_5OmniConditionalGenerationMixin: dummy_inputs=Qwen2_5OmniThinkerDummyInputsBuilder, ) class Qwen2_5OmniThinkerForConditionalGeneration( - nn.Module, SupportsMultiModal, 
SupportsPP, + nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, Qwen2_5OmniConditionalGenerationMixin): hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ @@ -798,6 +800,15 @@ class Qwen2_5OmniThinkerForConditionalGeneration( def get_language_model(self) -> torch.nn.Module: return self.language_model + def get_mm_mapping(self) -> MultiModelKeys: + """Get module prefix for multimodal models to filter LoRA modules.""" + return MultiModelKeys.from_string_field( + language_model="language_model", + connector=[], # No explicit connector in this model + tower_model=["visual", + "audio_tower"], # Exclude vision and audio towers + ) + def get_multimodal_embeddings(self, **kwargs: object) -> MultiModalEmbeddings: From 37241077d563d5808ce8f36ebce4eec25e1c1f88 Mon Sep 17 00:00:00 2001 From: nvjullin Date: Thu, 4 Sep 2025 21:25:40 +0800 Subject: [PATCH 86/95] [Misc] Removed force_fp8_e4m3fnuz from FP8LinearOp (#23725) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Julien Lin Signed-off-by: Luka Govedič Co-authored-by: Luka Govedič --- tests/compile/test_fusion.py | 27 +++++++++++-------- tests/compile/test_silu_mul_quant_fusion.py | 27 ++++++++++--------- tests/utils.py | 9 +++++++ .../layers/quantization/ptpc_fp8.py | 6 ++--- .../layers/quantization/utils/w8a8_utils.py | 6 ++--- 5 files changed, 45 insertions(+), 30 deletions(-) diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index c4229f93464ac..eedb9bdcd5299 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -15,9 +15,10 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape, QuantKey, ScaleDesc) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - Fp8LinearOp, maybe_create_device_identity) + Fp8LinearOp, cutlass_fp8_supported, maybe_create_device_identity) from vllm.platforms import 
current_platform +from ..utils import override_cutlass_fp8_supported from .backend import TestBackend FP8_DTYPE = current_platform.fp8_dtype() @@ -26,9 +27,9 @@ FP8_DTYPE = current_platform.fp8_dtype() class TestModel(torch.nn.Module): def __init__(self, hidden_size: int, eps: float, static: bool, - force_fp8_e4m3fnuz: bool, *args, **kwargs): + cuda_force_torch: bool, *args, **kwargs): super().__init__(*args, **kwargs) - self.force_fp8_e4m3fnuz = force_fp8_e4m3fnuz + self.cuda_force_torch = cuda_force_torch self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)] self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)] group_shape = GroupShape.PER_TENSOR if static else GroupShape.PER_TOKEN @@ -42,11 +43,12 @@ class TestModel(torch.nn.Module): torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t() for _ in range(2) ] - self.fp8_linear = Fp8LinearOp( - force_fp8_e4m3fnuz=force_fp8_e4m3fnuz, - act_quant_static=static, - act_quant_group_shape=group_shape, - ) + + with override_cutlass_fp8_supported(not cuda_force_torch): + self.fp8_linear = Fp8LinearOp( + act_quant_static=static, + act_quant_group_shape=group_shape, + ) def forward(self, x): resid = torch.sqrt(x) @@ -81,11 +83,14 @@ class TestModel(torch.nn.Module): @pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049]) @pytest.mark.parametrize("eps", [1e-5, 1e-6]) @pytest.mark.parametrize("static", [True, False]) -@pytest.mark.parametrize("force_fp8_e4m3fnuz", [True, False]) +# cuda_force_torch used to test torch code path on platforms that +# cutlass_fp8_supported() == True. 
+@pytest.mark.parametrize("cuda_force_torch", + [True, False] if cutlass_fp8_supported() else [True]) @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], reason="Only test on CUDA and ROCm") def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static, - force_fp8_e4m3fnuz): + cuda_force_torch): torch.set_default_device("cuda") torch.set_default_dtype(dtype) torch.manual_seed(1) @@ -102,7 +107,7 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static, fusion_pass = FusionPass.instance(vllm_config) backend = TestBackend(noop_pass, fusion_pass) - model = TestModel(hidden_size, eps, static, force_fp8_e4m3fnuz) + model = TestModel(hidden_size, eps, static, cuda_force_torch) # First dimension dynamic x = torch.rand(num_tokens, hidden_size) diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py index fcc2589e42116..e16d1725e6add 100644 --- a/tests/compile/test_silu_mul_quant_fusion.py +++ b/tests/compile/test_silu_mul_quant_fusion.py @@ -17,9 +17,10 @@ from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape, kFp8StaticTensorSym, kNvfp4Quant) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - Fp8LinearOp) + Fp8LinearOp, cutlass_fp8_supported) from vllm.platforms import current_platform +from ..utils import override_cutlass_fp8_supported from .backend import TestBackend FP8_DTYPE = current_platform.fp8_dtype() @@ -32,7 +33,7 @@ def is_nvfp4_supported(): class TestSiluMulFp8QuantModel(torch.nn.Module): - def __init__(self, hidden_size: int, force_fp8_e4m3fnuz: bool, **kwargs): + def __init__(self, hidden_size: int, cuda_force_torch: bool, **kwargs): super().__init__() self.silu_and_mul = SiluAndMul() self.wscale = torch.rand(1, dtype=torch.float32) @@ -40,11 +41,11 @@ class TestSiluMulFp8QuantModel(torch.nn.Module): self.w = torch.rand(hidden_size, 
hidden_size).to(dtype=FP8_DTYPE).t() - self.fp8_linear = Fp8LinearOp( - force_fp8_e4m3fnuz=force_fp8_e4m3fnuz, - act_quant_static=True, - act_quant_group_shape=GroupShape.PER_TENSOR, - ) + with override_cutlass_fp8_supported(not cuda_force_torch): + self.fp8_linear = Fp8LinearOp( + act_quant_static=True, + act_quant_group_shape=GroupShape.PER_TENSOR, + ) def forward(self, x): y = self.silu_and_mul(x) @@ -96,12 +97,15 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module): @pytest.mark.parametrize( "model_class", [TestSiluMulFp8QuantModel, TestSiluMulNvfp4QuantModel] if is_nvfp4_supported() else [TestSiluMulFp8QuantModel]) -@pytest.mark.parametrize("force_fp8_e4m3fnuz", [True, False]) +# cuda_force_torch used to test torch code path on platforms that +# cutlass_fp8_supported() == True. +@pytest.mark.parametrize("cuda_force_torch", + [True, False] if cutlass_fp8_supported() else [True]) @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], reason="Only test on CUDA and ROCm") def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class, - force_fp8_e4m3fnuz): - if model_class == TestSiluMulNvfp4QuantModel and force_fp8_e4m3fnuz: + cuda_force_torch): + if model_class == TestSiluMulNvfp4QuantModel and cuda_force_torch: pytest.skip("Duplicate tests for NVFP4") torch.set_default_device("cuda") @@ -114,8 +118,7 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class, fusion_pass = ActivationQuantFusionPass(config) backend = TestBackend(NoOpEliminationPass(config), fusion_pass) - model = model_class(hidden_size=hidden_size, - force_fp8_e4m3fnuz=force_fp8_e4m3fnuz) + model = model_class(hidden_size, cuda_force_torch) # First dimension dynamic x = torch.rand(num_tokens, hidden_size * 2) diff --git a/tests/utils.py b/tests/utils.py index 9d2073f3c1036..e47235002657d 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -17,6 +17,7 @@ from contextlib import contextmanager, suppress from multiprocessing import Process from pathlib import 
Path from typing import Any, Callable, Literal, Optional, Union +from unittest.mock import patch import cloudpickle import httpx @@ -1077,3 +1078,11 @@ def get_attn_backend_list_based_on_platform() -> list[str]: return attn_backend_list else: raise ValueError("Unsupported platform") + + +@contextmanager +def override_cutlass_fp8_supported(value: bool): + with patch( + "vllm.model_executor.layers.quantization.utils.w8a8_utils.cutlass_fp8_supported", + return_value=value): + yield diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py index 466fd5fba7685..45ea8e3520f1d 100644 --- a/vllm/model_executor/layers/quantization/ptpc_fp8.py +++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py @@ -92,13 +92,13 @@ class PTPCFp8LinearMethod(Fp8LinearMethod): """ def __init__(self, quant_config: PTPCFp8Config): + assert current_platform.is_rocm(), \ + "PTPCFp8LinearMethod is only supported on ROCm." super().__init__(quant_config=quant_config) # Force weight quantization self.quant_config.is_checkpoint_fp8_serialized = False self.fp8_linear = Fp8LinearOp( - act_quant_static=False, - act_quant_group_shape=GroupShape.PER_TOKEN, - force_fp8_e4m3fnuz=True) + act_quant_static=False, act_quant_group_shape=GroupShape.PER_TOKEN) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.weight = torch.nn.Parameter(layer.weight.data, diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index ecdcc573935c0..8f6b7f83d47f8 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -355,12 +355,10 @@ class Fp8LinearOp: def __init__(self, act_quant_static: bool, act_quant_group_shape: GroupShape = GroupShape.PER_TENSOR, - pad_output: Optional[bool] = None, - force_fp8_e4m3fnuz: bool = False): + pad_output: Optional[bool] = None): if 
current_platform.is_rocm(): self.preferred_backend = "rocm" - elif current_platform.is_cuda( - ) and not force_fp8_e4m3fnuz and cutlass_fp8_supported(): + elif current_platform.is_cuda() and cutlass_fp8_supported(): if has_flashinfer() and current_platform.has_device_capability( 100): self.preferred_backend = "flashinfer" From e41a0fa3772c5b9bbca439ce5ed7d05803febbc1 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 4 Sep 2025 07:55:23 -0700 Subject: [PATCH 87/95] [Perf] Freeze core engine proc heap after init (#24008) Signed-off-by: Nick Hill --- vllm/v1/engine/core.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 922c06b44be88..d7e9cfa3660b1 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import gc import os import queue import signal @@ -536,6 +537,11 @@ class EngineCoreProc(EngineCore): self.step_fn = (self.step if self.batch_queue is None else self.step_with_batch_queue) + # Mark the startup heap as static so that it's ignored by GC. + # Reduces pause times of oldest generation collections. 
+ gc.collect() + gc.freeze() + @contextmanager def _perform_handshakes( self, From 83609ca91d42c8847d1b4c272b011a0b6c27319e Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Thu, 4 Sep 2025 17:52:17 +0200 Subject: [PATCH 88/95] [Doc]: fix typos in Python comments (#24173) Signed-off-by: Didier Durand Co-authored-by: Russell Bryant Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- benchmarks/benchmark_dataset.py | 2 +- benchmarks/kernels/benchmark_lora.py | 2 +- benchmarks/multi_turn/benchmark_serving_multi_turn.py | 2 +- examples/offline_inference/audio_language.py | 2 +- tests/models/multimodal/generation/vlm_utils/builders.py | 2 +- .../models/multimodal/generation/vlm_utils/case_filtering.py | 2 +- vllm/attention/backends/mla/common.py | 2 +- vllm/engine/async_llm_engine.py | 2 +- vllm/engine/multiprocessing/client.py | 2 +- vllm/model_executor/layers/quantization/awq_triton.py | 2 +- vllm/model_executor/layers/quantization/base_config.py | 2 +- vllm/v1/attention/backends/mla/common.py | 4 ++-- 12 files changed, 13 insertions(+), 13 deletions(-) diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index 2ea4f9ccaff2b..64ffa62c04d85 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -403,7 +403,7 @@ class RandomDataset(BenchmarkDataset): # [6880, 6881] -> ['Ġcalls', 'here'] -> # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere'] # To avoid uncontrolled change of the prompt length, - # the encoded sequence is truncated before being decode again. + # the encoded sequence is truncated before being decoded again. 
total_input_len = prefix_len + int(input_lens[i]) re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[ :total_input_len diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index 3d38d4b3534e8..89309c79f0991 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -637,7 +637,7 @@ def bench_optype( # Clear LoRA optimization hash-maps. _LORA_A_PTR_DICT.clear() _LORA_B_PTR_DICT.clear() - # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup + # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up for kwargs in kwargs_list: op_type.bench_fn()(**kwargs) torch.cuda.synchronize() diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py index d23b7b6e4571d..66d85eaf51312 100644 --- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py +++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py @@ -962,7 +962,7 @@ async def main_mp( # At this point all the clients finished, # collect results (TTFT, TPOT, etc.) from all the clients. - # This needs to happens before calling join on the clients + # This needs to happen before calling join on the clients # (result_queue should be emptied). 
while not result_queue.empty(): client_metrics.append(result_queue.get()) diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index a5b8397e7e7ff..65a87d2dd9e8e 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -117,7 +117,7 @@ def run_gemma3n(question: str, audio_count: int) -> ModelRequestData: # Granite Speech def run_granite_speech(question: str, audio_count: int) -> ModelRequestData: - # NOTE - the setting in this example are somehat different than what is + # NOTE - the setting in this example are somewhat different from what is # optimal for granite speech, and it is generally recommended to use beam # search. Check the model README for suggested settings. # https://huggingface.co/ibm-granite/granite-speech-3.3-8b diff --git a/tests/models/multimodal/generation/vlm_utils/builders.py b/tests/models/multimodal/generation/vlm_utils/builders.py index 03c08240d6a81..133d5d6ee2ef8 100644 --- a/tests/models/multimodal/generation/vlm_utils/builders.py +++ b/tests/models/multimodal/generation/vlm_utils/builders.py @@ -250,7 +250,7 @@ def build_video_inputs_from_test_info( def apply_image_size_scaling(image, size: Union[float, tuple[int, int]], size_type: SizeType): - """Applies a size scaler to one image; this can be a an image size factor, + """Applies a size scaler to one image; this can be an image size factor, which scales the image while maintaining the aspect ratio""" # Special case for embeddings; if it's a tensor, it's only valid if we # are considering size factors at constant scale, i.e., we just clone diff --git a/tests/models/multimodal/generation/vlm_utils/case_filtering.py b/tests/models/multimodal/generation/vlm_utils/case_filtering.py index 336e2dd2b1201..1edb512135343 100644 --- a/tests/models/multimodal/generation/vlm_utils/case_filtering.py +++ b/tests/models/multimodal/generation/vlm_utils/case_filtering.py @@ -42,7 +42,7 @@ def 
get_filtered_test_settings( else: assert test_info.prompt_formatter is not None - # Everything looks okay; keep if this is has correct proc handling + # Everything looks okay; keep if this is correct proc handling if (test_info.distributed_executor_backend is not None) == new_proc_per_test: matching_tests[test_name] = test_info diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py index c5ed4c6e40326..3b9037521168e 100644 --- a/vllm/attention/backends/mla/common.py +++ b/vllm/attention/backends/mla/common.py @@ -822,7 +822,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[T], Generic[T]): and context_lens_tensor is not None \ and context_lens_tensor[:self.num_prefills].max() > 0: - # NOTE: it is recommend you read the `Chunked Prefill` section in + # NOTE: it is recommended you read the `Chunked Prefill` section in # the comment at the top of the file before trying to understand # the following code diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 9f9ad1854c3b6..6010a4647a0af 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -717,7 +717,7 @@ class AsyncLLMEngine(EngineClient): # Stop the execute model loop in parallel workers until there # are more requests to process. This avoids waiting # indefinitely in torch.distributed ops which may otherwise - # timeout, and unblocks the RPC thread in the workers so that + # time out, and unblocks the RPC thread in the workers so that # they can process any other queued control plane messages, # such as add/remove lora adapters. 
await engine.engine.stop_remote_worker_execution_loop_async() diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 2d3248859c940..0beb9c8cc0b97 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -270,7 +270,7 @@ class MQLLMEngineClient(EngineClient): queue.put_nowait(request_output) async def setup(self): - """Setup the client before it starts sending server requests.""" + """Set up the client before it starts sending server requests.""" # Start output_loop if self.output_loop is None: diff --git a/vllm/model_executor/layers/quantization/awq_triton.py b/vllm/model_executor/layers/quantization/awq_triton.py index ebc526d6db2f9..2e8894436a985 100644 --- a/vllm/model_executor/layers/quantization/awq_triton.py +++ b/vllm/model_executor/layers/quantization/awq_triton.py @@ -19,7 +19,7 @@ def awq_dequantize_kernel( num_rows, # input num rows in qweight BLOCK_SIZE_X: tl.constexpr, BLOCK_SIZE_Y: tl.constexpr): - # Setup the pids. + # Set up the pids. 
pid_x = tl.program_id(axis=0) pid_y = tl.program_id(axis=1) diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 4a43351260e9f..6fd94afbe5566 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -128,7 +128,7 @@ class QuantizationConfig(ABC): @staticmethod def get_from_keys_or(config: dict[str, Any], keys: list[str], default: Any) -> Any: - """Get a optional value from the model's quantization config.""" + """Get an optional value from the model's quantization config.""" try: return QuantizationConfig.get_from_keys(config, keys) except ValueError: diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index b4c9aae254ea0..9696b6c0913c4 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -401,7 +401,7 @@ M = TypeVar("M", bound=MLACommonMetadata) def use_flashinfer_prefill() -> bool: - # For blackwell default to flashinfer prefill if its available since + # For blackwell default to flashinfer prefill if it's available since # it is faster than FA2. 
return (flashinfer_available and not envs.VLLM_USE_CUDNN_PREFILL and current_platform.is_device_capability(100)) @@ -1018,7 +1018,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): return layer.weight # we currently do not have quantized bmm's which are needed for - # `W_UV` and `W_UK_T`, we we just store fp16/bf16 copies and perform + # `W_UV` and `W_UK_T`, we just store fp16/bf16 copies and perform # the bmm's in 16-bit, the extra memory overhead of this is fairly low kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T assert kv_b_proj_weight.shape == ( From 94866d7c9387e4e71944080fd30497bcb59db399 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 5 Sep 2025 00:06:51 +0800 Subject: [PATCH 89/95] [Misc] Slight improve deepgemm print (#24085) Signed-off-by: Jee Jee Li --- vllm/model_executor/layers/fused_moe/deep_gemm_moe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index 7b8467a5a0cf0..c0bfda73eee0d 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -57,13 +57,14 @@ def _valid_deep_gemm(hidden_states: torch.Tensor, w1: torch.Tensor, if not _valid_deep_gemm_shape(M, N, K): logger.debug_once( "DeepGemm disabled due to unaligned problem size. " - "M: %s, N: %s, K: %s. M should >= align size " - "and N and K must be multiples of %s." + "M: %s, N: %s, K: %s. M should >= %s " + "and N and K must be multiples of %s. 
" "This is not an error and we will fall back to triton.", M, N, K, align, + align, ) return False elif N <= 512: From 78336a0c3ee4eb9dba6e37959d926160e91623fd Mon Sep 17 00:00:00 2001 From: "Po-Han Huang (NVIDIA)" <53919306+nvpohanh@users.noreply.github.com> Date: Fri, 5 Sep 2025 00:49:20 +0800 Subject: [PATCH 90/95] Upgrade FlashInfer to v0.3.0 (#24086) Signed-off-by: Po-Han Huang Co-authored-by: Simon Mo --- docker/Dockerfile | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 6f8ca30ffd31b..b78d7d88f1f83 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -375,7 +375,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist # Install FlashInfer from source ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" # Keep this in sync with "flashinfer" extra in setup.py -ARG FLASHINFER_GIT_REF="v0.2.14.post1" +ARG FLASHINFER_GIT_REF="v0.3.0" # Flag to control whether to compile FlashInfer AOT kernels # Set to "true" to enable AOT compilation: # docker build --build-arg FLASHINFER_AOT_COMPILE=true ... diff --git a/setup.py b/setup.py index ffe8ec4e79af7..872696b250849 100644 --- a/setup.py +++ b/setup.py @@ -694,7 +694,7 @@ setup( "mistral_common[audio]"], # Required for audio processing "video": [], # Kept for backwards compatibility # FlashInfer should be updated together with the Dockerfile - "flashinfer": ["flashinfer-python==0.2.14.post1"], + "flashinfer": ["flashinfer-python==0.3.0"], # Optional deps for AMD FP4 quantization support "petit-kernel": ["petit-kernel"], }, From 482e52f56ccd9f7e86654909acc2f99384ae874f Mon Sep 17 00:00:00 2001 From: "Saman A. 
Pour" Date: Thu, 4 Sep 2025 13:33:43 -0700 Subject: [PATCH 91/95] QWEN3 Coder Fused MoE kernels Optimization configs (#24266) Signed-off-by: Saman Keon --- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ 9 files changed, 1314 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 
vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..bdbaf3811c939 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, 
+ "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..6e17bcd214748 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + 
"num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..aa7610cd75e77 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=640,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..df920e8b39ba8 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..e8fe8ea67f246 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 64, 
+ "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..0baf13cb6a5c5 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=20,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + 
}, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "3072": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..4fc4868eaa85a --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + 
"num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..d70adca05e779 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_GB200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, 
+ "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..0f5867fea5f89 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=40,N=2560,device_name=NVIDIA_H100,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 
256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} From 60b755cbcb1154e519572410f60d0d258eecbce4 Mon Sep 17 00:00:00 2001 From: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com> Date: Thu, 4 Sep 2025 14:25:30 -0700 Subject: [PATCH 92/95] [Misc] Have AsyncLLM `custom_stat_loggers` extend default logger list (#20952) Signed-off-by: Seiji Eicher Signed-off-by: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com> Co-authored-by: Nick Hill --- tests/v1/engine/test_async_llm.py | 5 +- tests/v1/metrics/test_engine_logger_apis.py | 83 +++++++++++++++++++++ vllm/v1/engine/async_llm.py | 8 +- vllm/v1/metrics/loggers.py | 12 +-- 4 files changed, 99 insertions(+), 9 deletions(-) create 
mode 100644 tests/v1/metrics/test_engine_logger_apis.py diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index df04a14af70ce..aca546600d0b5 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -393,7 +393,7 @@ class MockLoggingStatLogger(LoggingStatLogger): async def test_customize_loggers(monkeypatch): """Test that we can customize the loggers. If a customized logger is provided at the init, it should - be used directly. + be added to the default loggers. """ with monkeypatch.context() as m, ExitStack() as after: @@ -410,7 +410,8 @@ async def test_customize_loggers(monkeypatch): stat_loggers = engine.logger_manager.per_engine_logger_dict assert len(stat_loggers) == 1 - assert len(stat_loggers[0]) == 1 + assert len( + stat_loggers[0]) == 2 # LoggingStatLogger + MockLoggingStatLogger stat_loggers[0][0].log.assert_called_once() diff --git a/tests/v1/metrics/test_engine_logger_apis.py b/tests/v1/metrics/test_engine_logger_apis.py new file mode 100644 index 0000000000000..e6a4d0a2a2e8b --- /dev/null +++ b/tests/v1/metrics/test_engine_logger_apis.py @@ -0,0 +1,83 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import copy + +import pytest + +from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM +from vllm.v1.metrics.ray_wrappers import RayPrometheusStatLogger + + +class DummyStatLogger: + """ + A dummy stat logger for testing purposes. + Implements the minimal interface expected by StatLoggerManager. 
+ """ + + def __init__(self, vllm_config, engine_idx): + self.vllm_config = vllm_config + self.engine_idx = engine_idx + self.recorded = [] + self.logged = False + self.engine_initialized = False + + def record(self, scheduler_stats, iteration_stats, engine_idx): + self.recorded.append((scheduler_stats, iteration_stats, engine_idx)) + + def log(self): + self.logged = True + + def log_engine_initialized(self): + self.engine_initialized = True + + +@pytest.fixture +def log_stats_enabled_engine_args(): + """ + Shared fixture providing common AsyncEngineArgs configuration + used across multiple tests. + """ + return AsyncEngineArgs( + model="distilbert/distilgpt2", + dtype="half", + disable_log_stats=False, + enforce_eager=True, + ) + + +@pytest.mark.asyncio +async def test_async_llm_replace_default_loggers( + log_stats_enabled_engine_args): + """ + RayPrometheusStatLogger should replace the default PrometheusStatLogger + """ + + engine = AsyncLLM.from_engine_args(log_stats_enabled_engine_args, + stat_loggers=[RayPrometheusStatLogger]) + assert isinstance(engine.logger_manager.prometheus_logger, + RayPrometheusStatLogger) + engine.shutdown() + + +@pytest.mark.asyncio +async def test_async_llm_add_to_default_loggers(log_stats_enabled_engine_args): + """ + It's still possible to use custom stat loggers exclusively by passing + disable_log_stats=True in addition to a list of custom stat loggers. 
+ """ + # Create engine_args with disable_log_stats=True for this test + disabled_log_engine_args = copy.deepcopy(log_stats_enabled_engine_args) + disabled_log_engine_args.disable_log_stats = True + + # Disable default loggers; pass custom stat logger to the constructor + engine = AsyncLLM.from_engine_args(disabled_log_engine_args, + stat_loggers=[DummyStatLogger]) + + assert len(engine.logger_manager.per_engine_logger_dict[0]) == 1 + assert isinstance(engine.logger_manager.per_engine_logger_dict[0][0], + DummyStatLogger) + + # log_stats is still True, since custom stat loggers are used + assert engine.log_stats + + engine.shutdown() diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 2a9fa1fd9172c..d23602eaaffa9 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -98,7 +98,12 @@ class AsyncLLM(EngineClient): self.model_config = vllm_config.model_config self.vllm_config = vllm_config self.log_requests = log_requests - self.log_stats = log_stats + + self.log_stats = log_stats or (stat_loggers is not None) + if not log_stats and stat_loggers is not None: + logger.info( + "AsyncLLM created with log_stats=False and non-empty custom " + "logger list; enabling logging without default stat loggers") if self.model_config.skip_tokenizer_init: self.tokenizer = None @@ -137,6 +142,7 @@ class AsyncLLM(EngineClient): vllm_config=vllm_config, engine_idxs=self.engine_core.engine_ranks_managed, custom_stat_loggers=stat_loggers, + enable_default_loggers=log_stats, ) self.logger_manager.log_engine_initialized() diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 41e07a00564aa..f480344c854f7 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -651,16 +651,16 @@ class StatLoggerManager: vllm_config: VllmConfig, engine_idxs: Optional[list[int]] = None, custom_stat_loggers: Optional[list[StatLoggerFactory]] = None, + enable_default_loggers: bool = True, ): self.engine_idxs = engine_idxs if 
engine_idxs else [0] - factories: list[StatLoggerFactory] + factories: list[StatLoggerFactory] = [] if custom_stat_loggers is not None: - factories = custom_stat_loggers - else: - factories = [] - if logger.isEnabledFor(logging.INFO): - factories.append(LoggingStatLogger) + factories.extend(custom_stat_loggers) + + if enable_default_loggers and logger.isEnabledFor(logging.INFO): + factories.append(LoggingStatLogger) # engine_idx: StatLogger self.per_engine_logger_dict: dict[int, list[StatLoggerBase]] = {} From adc3ddb4309d4843a5425eed702bb3e1d942fc2d Mon Sep 17 00:00:00 2001 From: elvischenv <219235043+elvischenv@users.noreply.github.com> Date: Fri, 5 Sep 2025 05:25:45 +0800 Subject: [PATCH 93/95] [Bugfix][Misc] Fix silu_and_mul_nvfp4_quant issue and extract common utils for nvfp4 kernel source files (#23727) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com> Signed-off-by: Luka Govedič Co-authored-by: Luka Govedič --- .buildkite/test-pipeline.yaml | 4 +- csrc/dispatch_utils.h | 9 - csrc/ops.h | 3 +- .../activation_nvfp4_quant_fusion_kernels.cu | 212 ++---------- .../fp4/nvfp4_blockwise_moe_kernel.cu | 16 + csrc/quantization/fp4/nvfp4_experts_quant.cu | 310 +++--------------- csrc/quantization/fp4/nvfp4_quant_entry.cu | 18 + csrc/quantization/fp4/nvfp4_quant_kernels.cu | 271 +-------------- csrc/quantization/fp4/nvfp4_utils.cuh | 251 ++++++++++++++ csrc/torch_bindings.cpp | 3 +- .../test_silu_nvfp4_quant_fusion.py | 3 +- 11 files changed, 382 insertions(+), 718 deletions(-) create mode 100644 csrc/quantization/fp4/nvfp4_utils.cuh diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index be7044c41a732..55349e0ac9321 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -666,7 +666,7 @@ steps: # Quantization - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' - pytest -v -s 
tests/kernels/quantization/test_nvfp4_quant.py - # - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py + - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py @@ -676,7 +676,7 @@ steps: - pytest -v -s tests/compile/test_fusion_all_reduce.py - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern - pytest -v -s tests/kernels/moe/test_flashinfer.py - # - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py + - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py ##### 1 GPU test ##### ##### multi gpus test ##### diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h index 2728aa81f0c9f..995374a50b037 100644 --- a/csrc/dispatch_utils.h +++ b/csrc/dispatch_utils.h @@ -52,15 +52,6 @@ #define VLLM_DISPATCH_FP8_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FP8_TYPES(__VA_ARGS__)) -#define AT_DISPATCH_BYTE_CASE(enum_type, ...) \ - AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, byte_t, __VA_ARGS__) - -#define VLLM_DISPATCH_CASE_BYTE_TYPES(...) \ - AT_DISPATCH_BYTE_CASE(at::ScalarType::Byte, __VA_ARGS__) - -#define VLLM_DISPATCH_BYTE_TYPES(TYPE, NAME, ...) \ - AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_BYTE_TYPES(__VA_ARGS__)) - #define VLLM_DISPATCH_QUANT_TYPES(TYPE, NAME, ...) 
\ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_QUANT_TYPES(__VA_ARGS__)) diff --git a/csrc/ops.h b/csrc/ops.h index 7a176a5c00322..a288112e21000 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -130,8 +130,7 @@ void silu_and_mul(torch::Tensor& out, torch::Tensor& input); void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input, torch::Tensor& scale); -#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \ - (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120) +#ifndef USE_ROCM void silu_and_mul_nvfp4_quant(torch::Tensor& out, torch::Tensor& output_block_scale, torch::Tensor& input, diff --git a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu index 9bbeb0334fb9a..b4eb141cb4883 100644 --- a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu +++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu @@ -26,164 +26,17 @@ #include "dispatch_utils.h" #include "cuda_utils.h" +#include "nvfp4_utils.cuh" namespace vllm { -// Get type2 from type or vice versa (applied to half and bfloat16) -template -struct TypeConverter { - using Type = half2; -}; // keep for generality - -template <> -struct TypeConverter { - using Type = c10::Half; -}; - -template <> -struct TypeConverter { - using Type = half2; -}; - -template <> -struct TypeConverter<__nv_bfloat162> { - using Type = c10::BFloat16; -}; - -template <> -struct TypeConverter { - using Type = __nv_bfloat162; -}; - -#define ELTS_PER_THREAD 8 - -constexpr int CVT_FP4_ELTS_PER_THREAD = 8; -constexpr int CVT_FP4_SF_VEC_SIZE = 16; - -// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t). 
-inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) - uint32_t val; - asm volatile( - "{\n" - ".reg .b8 byte0;\n" - ".reg .b8 byte1;\n" - ".reg .b8 byte2;\n" - ".reg .b8 byte3;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" - "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" - "}" - : "=r"(val) - : "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]), - "f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7])); - return val; -#else - return 0; -#endif -} - -// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t). -inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) - uint32_t val; - asm volatile( - "{\n" - ".reg .b8 byte0;\n" - ".reg .b8 byte1;\n" - ".reg .b8 byte2;\n" - ".reg .b8 byte3;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" - "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" - "}" - : "=r"(val) - : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y), - "f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y)); - return val; -#else - return 0; -#endif -} - -// Fast reciprocal. -inline __device__ float reciprocal_approximate_ftz(float a) { - float b; - asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a)); - return b; -} - -template -__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx, - int numCols, - SFType* SFout) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) - static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 || - CVT_FP4_NUM_THREADS_PER_SF == 2); - - // One pair of threads write one SF to global memory. 
- // TODO: stage through smem for packed STG.32 - // is it better than STG.8 from 4 threads ? - if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) { - // SF vector index (16 elements share one SF in the K dimension). - int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF; - int32_t mIdx = rowIdx; - - // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)] - // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx] - - int32_t mTileIdx = mIdx / (32 * 4); - // SF vector size 16. - int factor = CVT_FP4_SF_VEC_SIZE * 4; - int32_t numKTiles = (numCols + factor - 1) / factor; - int64_t mTileStride = numKTiles * 32 * 4 * 4; - - int32_t kTileIdx = (kIdx / 4); - int64_t kTileStride = 32 * 4 * 4; - - // M tile layout [32, 4] is column-major. - int32_t outerMIdx = (mIdx % 32); - int64_t outerMStride = 4 * 4; - - int32_t innerMIdx = (mIdx % (32 * 4)) / 32; - int64_t innerMStride = 4; - - int32_t innerKIdx = (kIdx % 4); - int64_t innerKStride = 1; - - // Compute the global offset. - int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride + - outerMIdx * outerMStride + innerMIdx * innerMStride + - innerKIdx * innerKStride; - - return reinterpret_cast(SFout) + SFOffset; - } -#endif - return nullptr; -} - -// Define a 16 bytes packed data type. 
-template -struct PackedVec { - typename TypeConverter::Type elts[4]; -}; - -template <> -struct PackedVec<__nv_fp8_e4m3> { - __nv_fp8x2_e4m3 elts[8]; -}; - template __inline__ __device__ PackedVec compute_silu(PackedVec& vec, PackedVec& vec2) { PackedVec result; #pragma unroll for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; ++i) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { half2 val(0.5f, 0.5f); half2 t0 = __hmul2(vec.elts[i], val); half2 t1 = __hfma2(h2tanh(t0), val, val); @@ -206,13 +59,12 @@ __device__ uint32_t silu_and_cvt_warp_fp16_to_fp4(PackedVec& vec, PackedVec& vec2, float SFScaleVal, uint8_t* SFout) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) PackedVec out_silu = compute_silu(vec, vec2); // Get absolute maximum values among the local 8 values. auto localMax = __habs2(out_silu.elts[0]); - // Local maximum value. - #pragma unroll +// Local maximum value. +#pragma unroll for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) { localMax = __hmax2(localMax, __habs2(out_silu.elts[i])); } @@ -259,9 +111,9 @@ __device__ uint32_t silu_and_cvt_warp_fp16_to_fp4(PackedVec& vec, // Convert the input to float. float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2]; - #pragma unroll +#pragma unroll for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { fp2Vals[i] = __half22float2(out_silu.elts[i]); } else { fp2Vals[i] = __bfloat1622float2(out_silu.elts[i]); @@ -275,22 +127,14 @@ __device__ uint32_t silu_and_cvt_warp_fp16_to_fp4(PackedVec& vec, // Write the e2m1 values to global memory. return e2m1Vec; -#else - return 0; -#endif } // Use UE4M3 by default. 
template -__global__ void -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) -__launch_bounds__(1024, 4) silu_and_cvt_fp16_to_fp4( -#else -silu_and_cvt_fp16_to_fp4( -#endif - int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, - uint32_t* out, uint32_t* SFout) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) +__global__ void __launch_bounds__(1024, 4) + silu_and_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, + float const* SFScale, uint32_t* out, + uint32_t* SFout) { using PackedVec = PackedVec; static constexpr int CVT_FP4_NUM_THREADS_PER_SF = (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD); @@ -328,22 +172,25 @@ silu_and_cvt_fp16_to_fp4( in_vec, in_vec2, SFScaleVal, sf_out); } } -#endif } } // namespace vllm -void silu_and_mul_nvfp4_quant(torch::Tensor& output, // [..., d] - torch::Tensor& output_sf, - torch::Tensor& input, // [..., 2 * d] - torch::Tensor& input_sf) { - TORCH_CHECK(input.dtype() == torch::kFloat16 || - input.dtype() == torch::kBFloat16); +void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, // [..., d] + torch::Tensor& output_sf, + torch::Tensor& input, // [..., 2 * d] + torch::Tensor& input_sf) { int32_t m = input.size(0); int32_t n = input.size(1) / 2; + TORCH_CHECK(n % 16 == 0, "The N dimension must be multiple of 16."); + TORCH_CHECK(input.scalar_type() == at::ScalarType::Half || + input.scalar_type() == at::ScalarType::BFloat16, + "Unsupported input data type for quantize_to_fp4."); + int multiProcessorCount = get_device_attribute(cudaDevAttrMultiProcessorCount, -1); + auto input_sf_ptr = static_cast(input_sf.data_ptr()); auto sf_out = static_cast(output_sf.data_ptr()); auto output_ptr = static_cast(output.data_ptr()); @@ -352,17 +199,14 @@ void silu_and_mul_nvfp4_quant(torch::Tensor& output, // [..., d] dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024)); int const numBlocksPerSM = 2048 / block.x; dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM)); + 
VLLM_DISPATCH_HALF_TYPES( - input.scalar_type(), "act_and_mul_quant_kernel", [&] { - auto input_ptr = reinterpret_cast(input.data_ptr()); - VLLM_DISPATCH_BYTE_TYPES( - output.scalar_type(), "fused_act_and_mul_quant_kernel_nvfp4_type", - [&] { - vllm::silu_and_cvt_fp16_to_fp4 - <<>>( - m, n, input_ptr, input_sf_ptr, - reinterpret_cast(output_ptr), - reinterpret_cast(sf_out)); - }); + input.scalar_type(), "silu_and_mul_nvfp4_quant_kernel", [&] { + using cuda_type = vllm::CUDATypeConverter::Type; + auto input_ptr = static_cast(input.data_ptr()); + vllm::silu_and_cvt_fp16_to_fp4<<>>( + m, n, input_ptr, input_sf_ptr, + reinterpret_cast(output_ptr), + reinterpret_cast(sf_out)); }); } diff --git a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu index 03db5cc196d59..2c8df6144bf4d 100644 --- a/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu +++ b/csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #include #include diff --git a/csrc/quantization/fp4/nvfp4_experts_quant.cu b/csrc/quantization/fp4/nvfp4_experts_quant.cu index 190d66f318a83..ce3ba2c19b9eb 100644 --- a/csrc/quantization/fp4/nvfp4_experts_quant.cu +++ b/csrc/quantization/fp4/nvfp4_experts_quant.cu @@ -1,247 +1,42 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #include +#include +#include + #include #include -#include #include +#include "dispatch_utils.h" -template -struct TypeConverter { - using Type = half2; -}; // keep for generality +#include "nvfp4_utils.cuh" -template <> -struct TypeConverter { - using Type = half; -}; - -template <> -struct TypeConverter { - using Type = half2; -}; - -template <> -struct TypeConverter<__nv_bfloat162> { - using Type = __nv_bfloat16; -}; - -template <> -struct TypeConverter<__nv_bfloat16> { - using Type = __nv_bfloat162; -}; - -#define ELTS_PER_THREAD 8 - -constexpr int CVT_FP4_ELTS_PER_THREAD = 8; -constexpr int CVT_FP4_SF_VEC_SIZE = 16; - -// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t). 
-inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) - uint32_t val; - asm volatile( - "{\n" - ".reg .b8 byte0;\n" - ".reg .b8 byte1;\n" - ".reg .b8 byte2;\n" - ".reg .b8 byte3;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" - "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" - "}" - : "=r"(val) - : "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]), - "f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7])); - return val; -#else - return 0; -#endif -} - -// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t). -inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) - uint32_t val; - asm volatile( - "{\n" - ".reg .b8 byte0;\n" - ".reg .b8 byte1;\n" - ".reg .b8 byte2;\n" - ".reg .b8 byte3;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" - "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" - "}" - : "=r"(val) - : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y), - "f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y)); - return val; -#else - return 0; -#endif -} - -// Fast reciprocal. -inline __device__ float reciprocal_approximate_ftz(float a) { - float b; - asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a)); - return b; -} - -template -__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx, - int numCols, - SFType* SFout) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) - static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 || - CVT_FP4_NUM_THREADS_PER_SF == 2); - - // One pair of threads write one SF to global memory. 
- // TODO: stage through smem for packed STG.32 - // is it better than STG.8 from 4 threads ? - if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) { - // SF vector index (16 elements share one SF in the K dimension). - int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF; - int32_t mIdx = rowIdx; - - // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)] - // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx] - - int32_t mTileIdx = mIdx / (32 * 4); - // SF vector size 16. - int factor = CVT_FP4_SF_VEC_SIZE * 4; - int32_t numKTiles = (numCols + factor - 1) / factor; - int64_t mTileStride = numKTiles * 32 * 4 * 4; - - int32_t kTileIdx = (kIdx / 4); - int64_t kTileStride = 32 * 4 * 4; - - // M tile layout [32, 4] is column-major. - int32_t outerMIdx = (mIdx % 32); - int64_t outerMStride = 4 * 4; - - int32_t innerMIdx = (mIdx % (32 * 4)) / 32; - int64_t innerMStride = 4; - - int32_t innerKIdx = (kIdx % 4); - int64_t innerKStride = 1; - - // Compute the global offset. - int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride + - outerMIdx * outerMStride + innerMIdx * innerMStride + - innerKIdx * innerKStride; - - return reinterpret_cast(SFout) + SFOffset; - } -#endif - return nullptr; -} - -// Define a 16 bytes packed data type. -template -struct PackedVec { - typename TypeConverter::Type elts[4]; -}; - -template <> -struct PackedVec<__nv_fp8_e4m3> { - __nv_fp8x2_e4m3 elts[8]; -}; - -// Quantizes the provided PackedVec into the uint32_t output -template -__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec& vec, float SFScaleVal, - uint8_t* SFout) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) - // Get absolute maximum values among the local 8 values. - auto localMax = __habs2(vec.elts[0]); - - // Local maximum value. - #pragma unroll - for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) { - localMax = __hmax2(localMax, __habs2(vec.elts[i])); - } - - // Get the absolute maximum among all 16 values (two threads). 
- localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax); - // Get the final absolute maximum values. - float vecMax = float(__hmax(localMax.x, localMax.y)); - - // Get the SF (max value of the vector / max value of e2m1). - // maximum value of e2m1 = 6.0. - // TODO: use half as compute data type. - float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f)); - // 8 bits representation of the SF. - uint8_t fp8SFVal; - // Write the SF to global memory (STG.8). - if constexpr (UE8M0_SF) { - // Extract the 8 exponent bits from float32. - // float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits. - uint32_t tmp = reinterpret_cast(SFValue) >> 23; - fp8SFVal = tmp & 0xff; - // Convert back to fp32. - reinterpret_cast(SFValue) = tmp << 23; - } else { - // Here SFValue is always positive, so E4M3 is the same as UE4M3. - __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue); - reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp; - // Convert back to fp32. - SFValue = float(tmp); - } - // Get the output scale. - // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) * - // reciprocal(SFScaleVal)) - float outputScale = - SFValue != 0 ? reciprocal_approximate_ftz( - SFValue * reciprocal_approximate_ftz(SFScaleVal)) - : 0.0f; - - if (SFout) { - // Write the SF to global memory (STG.8). - *SFout = fp8SFVal; - } - - // Convert the input to float. - float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2]; - - #pragma unroll - for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) { - if constexpr (std::is_same_v) { - fp2Vals[i] = __half22float2(vec.elts[i]); - } else { - fp2Vals[i] = __bfloat1622float2(vec.elts[i]); - } - fp2Vals[i].x *= outputScale; - fp2Vals[i].y *= outputScale; - } - - // Convert to e2m1 values. - uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals); - - // Write the e2m1 values to global memory. - return e2m1Vec; -#else - return 0; -#endif -} +namespace vllm { // Use UE4M3 by default. 
template -__global__ void -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) -__launch_bounds__(512, 4) cvt_fp16_to_fp4( -#else -cvt_fp16_to_fp4( -#endif - int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, - uint32_t* out, uint32_t* SFout, uint32_t* input_offset_by_experts, - uint32_t* output_scale_offset_by_experts, int n_experts, bool low_latency) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) +__global__ void __launch_bounds__(512, 4) + cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, + float const* SFScale, uint32_t* out, uint32_t* SFout, + uint32_t* input_offset_by_experts, + uint32_t* output_scale_offset_by_experts, int n_experts, + bool low_latency) { using PackedVec = PackedVec; static constexpr int CVT_FP4_NUM_THREADS_PER_SF = (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD); @@ -299,8 +94,8 @@ cvt_fp16_to_fp4( &input_offset_by_experts[chunk_start + 12])); local_offsets[16] = __ldca(&input_offset_by_experts[chunk_start + 16]); - // Check against the 16 loaded offsets - #pragma unroll +// Check against the 16 loaded offsets +#pragma unroll for (int i = 0; i < 16; i++) { if (rowIdx >= local_offsets[i] && rowIdx < local_offsets[i + 1]) { rowIdx_in_expert = rowIdx - local_offsets[i]; @@ -330,21 +125,15 @@ cvt_fp16_to_fp4( out_pos = cvt_warp_fp16_to_fp4(in_vec, SFScaleVal, sf_out); } -#endif } // Kernel for LARGE_M_TOPK = true (large m_topk optimized version) template -__global__ void -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) -__launch_bounds__(1024, 4) cvt_fp16_to_fp4( -#else -cvt_fp16_to_fp4( -#endif - int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, - uint32_t* out, uint32_t* SFout, uint32_t* input_offset_by_experts, - uint32_t* output_scale_offset_by_experts, int n_experts) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) +__global__ void __launch_bounds__(1024, 4) + cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, + float const* SFScale, 
uint32_t* out, uint32_t* SFout, + uint32_t* input_offset_by_experts, + uint32_t* output_scale_offset_by_experts, int n_experts) { using PackedVec = PackedVec; static constexpr int CVT_FP4_NUM_THREADS_PER_SF = (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD); @@ -425,7 +214,6 @@ cvt_fp16_to_fp4( out_pos = cvt_warp_fp16_to_fp4(in_vec, SFScaleVal, sf_out); } -#endif } template @@ -501,6 +289,8 @@ void quant_impl(void* output, void* output_scale, void* input, } } +} // namespace vllm + /*Quantization entry for fp4 experts quantization*/ #define CHECK_TH_CUDA(x, m) TORCH_CHECK(x.is_cuda(), m, "must be a CUDA tensor") #define CHECK_CONTIGUOUS(x, m) \ @@ -560,23 +350,17 @@ void scaled_fp4_experts_quant_sm100a( // 4 means 4 fp8 values are packed into one int32 TORCH_CHECK(output_scale.size(1) * 4 == padded_k); - auto in_dtype = input.dtype(); const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(input.get_device()); - if (in_dtype == at::ScalarType::Half) { - quant_impl(output.data_ptr(), output_scale.data_ptr(), - input.data_ptr(), input_global_scale.data_ptr(), - input_offset_by_experts.data_ptr(), - output_scale_offset_by_experts.data_ptr(), m_topk, k, - n_experts, stream); - } else if (in_dtype == at::ScalarType::BFloat16) { - quant_impl<__nv_bfloat16>(output.data_ptr(), output_scale.data_ptr(), - input.data_ptr(), input_global_scale.data_ptr(), - input_offset_by_experts.data_ptr(), - output_scale_offset_by_experts.data_ptr(), m_topk, - k, n_experts, stream); - } else { - TORCH_CHECK(false, "Expected input data type to be half or bfloat16"); - } + + VLLM_DISPATCH_HALF_TYPES( + input.scalar_type(), "nvfp4_experts_quant_kernel", [&] { + using cuda_type = vllm::CUDATypeConverter::Type; + vllm::quant_impl( + output.data_ptr(), output_scale.data_ptr(), input.data_ptr(), + input_global_scale.data_ptr(), input_offset_by_experts.data_ptr(), + output_scale_offset_by_experts.data_ptr(), m_topk, k, n_experts, + 
stream); + }); } diff --git a/csrc/quantization/fp4/nvfp4_quant_entry.cu b/csrc/quantization/fp4/nvfp4_quant_entry.cu index 1b61bd4519fc3..c2b39e5438805 100644 --- a/csrc/quantization/fp4/nvfp4_quant_entry.cu +++ b/csrc/quantization/fp4/nvfp4_quant_entry.cu @@ -32,6 +32,14 @@ void scaled_fp4_experts_quant_sm100a( torch::Tensor const& output_scale_offset_by_experts); #endif +#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \ + (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120) +void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, + torch::Tensor& output_sf, + torch::Tensor& input, + torch::Tensor& input_sf); +#endif + void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input, torch::Tensor& output_sf, torch::Tensor const& input_sf) { #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \ @@ -54,3 +62,13 @@ void scaled_fp4_experts_quant( TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 experts quantization kernel"); } + +void silu_and_mul_nvfp4_quant(torch::Tensor& output, torch::Tensor& output_sf, + torch::Tensor& input, torch::Tensor& input_sf) { +#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \ + (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120) + return silu_and_mul_nvfp4_quant_sm1xxa(output, output_sf, input, input_sf); +#endif + TORCH_CHECK_NOT_IMPLEMENTED( + false, "No compiled silu_and_mul nvfp4 quantization kernel"); +} diff --git a/csrc/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/quantization/fp4/nvfp4_quant_kernels.cu index 4e080de151648..0c1b9ef0664d7 100644 --- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu +++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu @@ -23,245 +23,18 @@ #include #include +#include "dispatch_utils.h" #include "cuda_utils.h" +#include "nvfp4_utils.cuh" -// Get type2 from type or vice versa (applied to half and bfloat16) -template -struct TypeConverter { - using Type = half2; -}; // keep for generality - -template <> -struct TypeConverter { - using Type = half; -}; - 
-template <> -struct TypeConverter { - using Type = half2; -}; - -template <> -struct TypeConverter<__nv_bfloat162> { - using Type = __nv_bfloat16; -}; - -template <> -struct TypeConverter<__nv_bfloat16> { - using Type = __nv_bfloat162; -}; - -#define ELTS_PER_THREAD 8 - -constexpr int CVT_FP4_ELTS_PER_THREAD = 8; -constexpr int CVT_FP4_SF_VEC_SIZE = 16; - -// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t). -inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) - uint32_t val; - asm volatile( - "{\n" - ".reg .b8 byte0;\n" - ".reg .b8 byte1;\n" - ".reg .b8 byte2;\n" - ".reg .b8 byte3;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" - "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" - "}" - : "=r"(val) - : "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]), - "f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7])); - return val; -#else - return 0; -#endif -} - -// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t). -inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) - uint32_t val; - asm volatile( - "{\n" - ".reg .b8 byte0;\n" - ".reg .b8 byte1;\n" - ".reg .b8 byte2;\n" - ".reg .b8 byte3;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" - "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" - "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" - "}" - : "=r"(val) - : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y), - "f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y)); - return val; -#else - return 0; -#endif -} - -// Fast reciprocal. 
-inline __device__ float reciprocal_approximate_ftz(float a) { - float b; - asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a)); - return b; -} - -template -__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx, - int numCols, - SFType* SFout) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) - static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 || - CVT_FP4_NUM_THREADS_PER_SF == 2); - - // One pair of threads write one SF to global memory. - // TODO: stage through smem for packed STG.32 - // is it better than STG.8 from 4 threads ? - if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) { - // SF vector index (16 elements share one SF in the K dimension). - int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF; - int32_t mIdx = rowIdx; - - // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)] - // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx] - - int32_t mTileIdx = mIdx / (32 * 4); - // SF vector size 16. - int factor = CVT_FP4_SF_VEC_SIZE * 4; - int32_t numKTiles = (numCols + factor - 1) / factor; - int64_t mTileStride = numKTiles * 32 * 4 * 4; - - int32_t kTileIdx = (kIdx / 4); - int64_t kTileStride = 32 * 4 * 4; - - // M tile layout [32, 4] is column-major. - int32_t outerMIdx = (mIdx % 32); - int64_t outerMStride = 4 * 4; - - int32_t innerMIdx = (mIdx % (32 * 4)) / 32; - int64_t innerMStride = 4; - - int32_t innerKIdx = (kIdx % 4); - int64_t innerKStride = 1; - - // Compute the global offset. - int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride + - outerMIdx * outerMStride + innerMIdx * innerMStride + - innerKIdx * innerKStride; - - return reinterpret_cast(SFout) + SFOffset; - } -#endif - return nullptr; -} - -// Define a 16 bytes packed data type. 
-template -struct PackedVec { - typename TypeConverter::Type elts[4]; -}; - -template <> -struct PackedVec<__nv_fp8_e4m3> { - __nv_fp8x2_e4m3 elts[8]; -}; - -// Quantizes the provided PackedVec into the uint32_t output -template -__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec& vec, float SFScaleVal, - uint8_t* SFout) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) - // Get absolute maximum values among the local 8 values. - auto localMax = __habs2(vec.elts[0]); - - // Local maximum value. - #pragma unroll - for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) { - localMax = __hmax2(localMax, __habs2(vec.elts[i])); - } - - // Get the absolute maximum among all 16 values (two threads). - localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax); - // Get the final absolute maximum values. - float vecMax = float(__hmax(localMax.x, localMax.y)); - - // Get the SF (max value of the vector / max value of e2m1). - // maximum value of e2m1 = 6.0. - // TODO: use half as compute data type. - float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f)); - // 8 bits representation of the SF. - uint8_t fp8SFVal; - // Write the SF to global memory (STG.8). - if constexpr (UE8M0_SF) { - // Extract the 8 exponent bits from float32. - // float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits. - uint32_t tmp = reinterpret_cast(SFValue) >> 23; - fp8SFVal = tmp & 0xff; - // Convert back to fp32. - reinterpret_cast(SFValue) = tmp << 23; - } else { - // Here SFValue is always positive, so E4M3 is the same as UE4M3. - __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue); - reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp; - // Convert back to fp32. - SFValue = float(tmp); - } - // Get the output scale. - // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) * - // reciprocal(SFScaleVal)) - float outputScale = - SFValue != 0 ? 
reciprocal_approximate_ftz( - SFValue * reciprocal_approximate_ftz(SFScaleVal)) - : 0.0f; - - if (SFout) { - // Write the SF to global memory (STG.8). - *SFout = fp8SFVal; - } - - // Convert the input to float. - float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2]; - - #pragma unroll - for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) { - if constexpr (std::is_same_v) { - fp2Vals[i] = __half22float2(vec.elts[i]); - } else { - fp2Vals[i] = __bfloat1622float2(vec.elts[i]); - } - fp2Vals[i].x *= outputScale; - fp2Vals[i].y *= outputScale; - } - - // Convert to e2m1 values. - uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals); - - // Write the e2m1 values to global memory. - return e2m1Vec; -#else - return 0; -#endif -} +namespace vllm { // Use UE4M3 by default. template -__global__ void -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) -__launch_bounds__(512, 4) cvt_fp16_to_fp4( -#else -cvt_fp16_to_fp4( -#endif - int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, - uint32_t* out, uint32_t* SFout) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) +__global__ void __launch_bounds__(512, 4) + cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, + float const* SFScale, uint32_t* out, uint32_t* SFout) { using PackedVec = PackedVec; static constexpr int CVT_FP4_NUM_THREADS_PER_SF = (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD); @@ -293,7 +66,6 @@ cvt_fp16_to_fp4( cvt_warp_fp16_to_fp4(in_vec, SFScaleVal, sf_out); } } -#endif } template @@ -332,6 +104,8 @@ template void invokeFP4Quantization(int m, int n, __nv_bfloat16 const* input, int multiProcessorCount, cudaStream_t stream); +} // namespace vllm + void scaled_fp4_quant_sm1xxa(torch::Tensor const& output, torch::Tensor const& input, torch::Tensor const& output_sf, @@ -340,6 +114,9 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output, int32_t n = input.size(1); TORCH_CHECK(n % 16 == 0, "The N dimension must be multiple of 16."); + TORCH_CHECK(input.scalar_type() == 
at::ScalarType::Half || + input.scalar_type() == at::ScalarType::BFloat16, + "Unsupported input data type for quantize_to_fp4."); int multiProcessorCount = get_device_attribute(cudaDevAttrMultiProcessorCount, -1); @@ -353,24 +130,10 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output, // We don't support e8m0 scales at this moment. bool useUE8M0 = false; - switch (input.scalar_type()) { - case torch::kHalf: { - auto input_ptr = reinterpret_cast(input.data_ptr()); - invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr, sf_out, - useUE8M0, multiProcessorCount, stream); - break; - } - case torch::kBFloat16: { - auto input_ptr = reinterpret_cast<__nv_bfloat16 const*>(input.data_ptr()); - invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr, sf_out, - useUE8M0, multiProcessorCount, stream); - break; - } - default: { - std::cerr << "Observing: " << input.scalar_type() - << " for the input datatype which is invalid"; - throw std::runtime_error( - "Unsupported input data type for quantize_to_fp4."); - } - } + VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] { + using cuda_type = vllm::CUDATypeConverter::Type; + auto input_ptr = static_cast(input.data_ptr()); + vllm::invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr, + sf_out, useUE8M0, multiProcessorCount, stream); + }); } diff --git a/csrc/quantization/fp4/nvfp4_utils.cuh b/csrc/quantization/fp4/nvfp4_utils.cuh new file mode 100644 index 0000000000000..48e4959de9793 --- /dev/null +++ b/csrc/quantization/fp4/nvfp4_utils.cuh @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#define ELTS_PER_THREAD 8 + +constexpr int CVT_FP4_ELTS_PER_THREAD = 8; +constexpr int CVT_FP4_SF_VEC_SIZE = 16; + +namespace vllm { + +// Convert PyTorch cpp type to CUDA type +template +struct CUDATypeConverter { + using Type = T; +}; + +template <> +struct CUDATypeConverter { + using Type = half; +}; + +template <> +struct CUDATypeConverter { + using Type = __nv_bfloat16; +}; + +// Get type2 from type or vice versa (applied to half and bfloat16) +template +struct TypeConverter { + using Type = half2; +}; // keep for generality + +template <> +struct TypeConverter { + using Type = half; +}; + +template <> +struct TypeConverter { + using Type = half2; +}; + +template <> +struct TypeConverter<__nv_bfloat162> { + using Type = __nv_bfloat16; +}; + +template <> +struct TypeConverter<__nv_bfloat16> { + using Type = __nv_bfloat162; +}; + +// Define a 16 bytes packed data type. +template +struct PackedVec { + typename TypeConverter::Type elts[4]; +}; + +template <> +struct PackedVec<__nv_fp8_e4m3> { + __nv_fp8x2_e4m3 elts[8]; +}; + +// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t). 
+inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) { + uint32_t val; + asm volatile( + "{\n" + ".reg .b8 byte0;\n" + ".reg .b8 byte1;\n" + ".reg .b8 byte2;\n" + ".reg .b8 byte3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" + "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" + "}" + : "=r"(val) + : "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]), + "f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7])); + return val; +} + +// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t). +inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) { + uint32_t val; + asm volatile( + "{\n" + ".reg .b8 byte0;\n" + ".reg .b8 byte1;\n" + ".reg .b8 byte2;\n" + ".reg .b8 byte3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" + "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" + "}" + : "=r"(val) + : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y), + "f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y)); + return val; +} + +// Fast reciprocal. +inline __device__ float reciprocal_approximate_ftz(float a) { + float b; + asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a)); + return b; +} + +template +__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx, + int numCols, + SFType* SFout) { + static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 || + CVT_FP4_NUM_THREADS_PER_SF == 2); + + // One pair of threads write one SF to global memory. + // TODO: stage through smem for packed STG.32 + // is it better than STG.8 from 4 threads ? + if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) { + // SF vector index (16 elements share one SF in the K dimension). 
+ int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF; + int32_t mIdx = rowIdx; + + // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)] + // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx] + + int32_t mTileIdx = mIdx / (32 * 4); + // SF vector size 16. + int factor = CVT_FP4_SF_VEC_SIZE * 4; + int32_t numKTiles = (numCols + factor - 1) / factor; + int64_t mTileStride = numKTiles * 32 * 4 * 4; + + int32_t kTileIdx = (kIdx / 4); + int64_t kTileStride = 32 * 4 * 4; + + // M tile layout [32, 4] is column-major. + int32_t outerMIdx = (mIdx % 32); + int64_t outerMStride = 4 * 4; + + int32_t innerMIdx = (mIdx % (32 * 4)) / 32; + int64_t innerMStride = 4; + + int32_t innerKIdx = (kIdx % 4); + int64_t innerKStride = 1; + + // Compute the global offset. + int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride + + outerMIdx * outerMStride + innerMIdx * innerMStride + + innerKIdx * innerKStride; + + return reinterpret_cast(SFout) + SFOffset; + } + return nullptr; +} + +// Quantizes the provided PackedVec into the uint32_t output +template +__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec& vec, float SFScaleVal, + uint8_t* SFout) { + // Get absolute maximum values among the local 8 values. + auto localMax = __habs2(vec.elts[0]); + +// Local maximum value. +#pragma unroll + for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) { + localMax = __hmax2(localMax, __habs2(vec.elts[i])); + } + + // Get the absolute maximum among all 16 values (two threads). + localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax); + // Get the final absolute maximum values. + float vecMax = float(__hmax(localMax.x, localMax.y)); + + // Get the SF (max value of the vector / max value of e2m1). + // maximum value of e2m1 = 6.0. + // TODO: use half as compute data type. + float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f)); + // 8 bits representation of the SF. 
+ uint8_t fp8SFVal; + // Write the SF to global memory (STG.8). + if constexpr (UE8M0_SF) { + // Extract the 8 exponent bits from float32. + // float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits. + uint32_t tmp = reinterpret_cast(SFValue) >> 23; + fp8SFVal = tmp & 0xff; + // Convert back to fp32. + reinterpret_cast(SFValue) = tmp << 23; + } else { + // Here SFValue is always positive, so E4M3 is the same as UE4M3. + __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue); + reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp; + // Convert back to fp32. + SFValue = float(tmp); + } + // Get the output scale. + // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) * + // reciprocal(SFScaleVal)) + float outputScale = + SFValue != 0 ? reciprocal_approximate_ftz( + SFValue * reciprocal_approximate_ftz(SFScaleVal)) + : 0.0f; + + if (SFout) { + // Write the SF to global memory (STG.8). + *SFout = fp8SFVal; + } + + // Convert the input to float. + float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2]; + +#pragma unroll + for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) { + if constexpr (std::is_same_v) { + fp2Vals[i] = __half22float2(vec.elts[i]); + } else { + fp2Vals[i] = __bfloat1622float2(vec.elts[i]); + } + fp2Vals[i].x *= outputScale; + fp2Vals[i].y *= outputScale; + } + + // Convert to e2m1 values. + uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals); + + // Write the e2m1 values to global memory. + return e2m1Vec; +} + +} // namespace vllm diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 56626a02c0277..b769c09adc0f0 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -115,8 +115,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "silu_and_mul_quant(Tensor! 
result, Tensor input, Tensor scale) -> ()"); ops.impl("silu_and_mul_quant", torch::kCUDA, &silu_and_mul_quant); -#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \ - (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120) +#ifndef USE_ROCM ops.def( "silu_and_mul_nvfp4_quant(Tensor! result, Tensor! result_block_scale, " "Tensor input, Tensor input_global_scale) -> ()"); diff --git a/tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py b/tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py index 4325162ae94a9..969f14cc3fe62 100644 --- a/tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py +++ b/tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py @@ -8,8 +8,7 @@ from vllm.model_executor.layers.activation import SiluAndMul from vllm.platforms import current_platform from vllm.scalar_type import scalar_types -if not (current_platform.has_device_capability(100) - and hasattr(torch.ops._C, "silu_and_mul_nvfp4_quant")): +if not current_platform.has_device_capability(100): pytest.skip(reason="Nvfp4 Requires compute capability of 10 or above.", allow_module_level=True) From 886ccbe5bae5ac3562f2b73c7770bba73d2ea34e Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Thu, 4 Sep 2025 14:58:44 -0700 Subject: [PATCH 94/95] [CI/Build] Reduce the number of redundant cases to test for LoRA (#24276) Signed-off-by: Zhuohan Li --- tests/lora/test_layers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 6e2dda464d8eb..891bc75fcdee0 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -60,9 +60,9 @@ DEVICES = ([ # prefill stage(True) or decode stage(False) STAGES = [True, False] -NUM_RANDOM_SEEDS = 6 +NUM_RANDOM_SEEDS = 2 -VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128 +VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 2 @pytest.fixture(autouse=True) From 65e038931d8599dd9ab80ca5b53d5573d5b74fd7 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Fri, 5 Sep 2025 01:04:12 +0200 Subject: [PATCH 95/95] [Frontend] Skip unnecessary detokenization when token_id is requested (#24236) Signed-off-by: NickLucche --- vllm/entrypoints/openai/serving_chat.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 35edd2f85cd07..fff6dcd724ad6 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -1419,9 +1419,10 @@ class OpenAIServingChat(OpenAIServing): step_top_logprobs = top_logprobs[i] if step_top_logprobs is None or step_top_logprobs.get( token_id) is None: - token = tokenizer.decode(token_id) if should_return_as_token_id: token = f"token_id:{token_id}" + else: + token = tokenizer.decode(token_id) logprobs_content.append( ChatCompletionLogProbsContent(