From 9bd9294f0e0434190bc4cb3886288572b26e272b Mon Sep 17 00:00:00 2001 From: HWH <67449739+jio-H@users.noreply.github.com> Date: Thu, 14 Aug 2025 00:41:41 +0800 Subject: [PATCH 01/23] [Bugfix] Fix MiniCPMV Image input inference failed (#22813) Signed-off-by: HWH <67449739+jio-H@users.noreply.github.com> Signed-off-by: DarkLight1337 Signed-off-by: Cyrus Leung Co-authored-by: DarkLight1337 Co-authored-by: Cyrus Leung --- vllm/model_executor/models/minicpmv.py | 17 +++++++ vllm/utils/tensor_schema.py | 70 ++++++++++++++++---------- 2 files changed, 60 insertions(+), 27 deletions(-) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 7db3a1bb90b47..88dd1a57626f2 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -85,6 +85,23 @@ class MiniCPMVImagePixelInputs(TensorSchema): - w: Width """ + def _validate_nested_tensors( + self, + value: Union[list[torch.Tensor], tuple[torch.Tensor, ...]], + field_name: str, + expected_shape: tuple[Union[int, str], ...], + dynamic_dims: set[str], + ) -> tuple[int, ...]: + # value[0] is the scaled image, + # and value[1:] is a collection of image slices. + # It is ensured that all slices in the collection + # have the same shape. 
+ if field_name == "pixel_values": + value = value[1:] if len(value) > 1 else value + + return super()._validate_nested_tensors(value, field_name, + expected_shape, dynamic_dims) + type: Literal["pixel_values"] = "pixel_values" # Note that the image size may vary, so we pass it as a list instead of a diff --git a/vllm/utils/tensor_schema.py b/vllm/utils/tensor_schema.py index 4c3acf0094c74..21d3249fe1547 100644 --- a/vllm/utils/tensor_schema.py +++ b/vllm/utils/tensor_schema.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Annotated, Any, Union, get_args, get_origin, get_type_hints +from typing import (Annotated, Any, Optional, Union, get_args, get_origin, + get_type_hints) import torch @@ -11,9 +12,13 @@ logger = init_logger(__name__) class TensorShape: - def __init__(self, - *dims: Union[int, str], - dynamic_dims: set[str, ...] = None) -> None: + def __init__( + self, + *dims: Union[int, str], + dynamic_dims: Optional[set[str]] = None, + ) -> None: + super().__init__() + self.dims = dims self.dynamic_dims = dynamic_dims if dynamic_dims else set() @@ -44,11 +49,15 @@ class TensorShape: class TensorSchema: - def __init__(self, - *, - validate: bool = True, - resolve_bindings: dict[str, int] = None, - **kwargs: Any) -> None: + def __init__( + self, + *, + validate: bool = True, + resolve_bindings: Optional[dict[str, int]] = None, + **kwargs: Any, + ) -> None: + super().__init__() + self._resolve_bindings = resolve_bindings if resolve_bindings else {} for key, value in kwargs.items(): @@ -57,16 +66,19 @@ class TensorSchema: if validate: self.validate() - def __getitem__(self, item) -> Any: - return getattr(self, item) + def __getitem__(self, key: str) -> Any: + return getattr(self, key) - def get(self, item, default=None) -> Any: - return getattr(self, item, default) + def get(self, key: str, default: Any = None) -> Any: + return getattr(self, key, default) - def 
_match_shape_with_dynamic(self, actual: tuple[int, ...], - reference: tuple[int, ...], - expected_shape: tuple[Union[int, str], ...], - dynamic_dims: set[str, ...]) -> bool: + def _match_shape_with_dynamic( + self, + actual: tuple[int, ...], + reference: tuple[int, ...], + expected_shape: tuple[Union[int, str], ...], + dynamic_dims: set[str], + ) -> bool: if len(actual) != len(reference) or len(actual) > len(expected_shape): return False @@ -84,10 +96,12 @@ class TensorSchema: return True def _validate_nested_tensors( - self, value: Union[list[torch.Tensor, ...], - tuple[torch.Tensor, ...]], field_name: str, - expected_shape: tuple[Union[int, str], ...], - dynamic_dims: set[str, ...]) -> tuple[int, ...]: + self, + value: Union[list[torch.Tensor], tuple[torch.Tensor, ...]], + field_name: str, + expected_shape: tuple[Union[int, str], ...], + dynamic_dims: set[str], + ) -> tuple[int, ...]: """Validate a list/tuple of tensors and return the actual shape.""" # Ensure all tensors in the list have the same # shape, besides dynamic dimensions @@ -110,12 +124,14 @@ class TensorSchema: # shape = (len(list), *tensor.shape) return (len(value), ) + first.shape - def _validate_tensor_shape_expected(self, actual_shape: tuple[int, ...], - expected_shape: tuple[Union[int, str], - ...], - field_name: str, shape_env: dict[str, - int], - dynamic_dims: set[str, ...]) -> None: + def _validate_tensor_shape_expected( + self, + actual_shape: tuple[int, ...], + expected_shape: tuple[Union[int, str], ...], + field_name: str, + shape_env: dict[str, int], + dynamic_dims: set[str], + ) -> None: """Validate that the actual tensor shape matches the expected shape.""" if len(actual_shape) != len(expected_shape): From c9232d41f433abd1d6f0960bcec020660078d718 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 14 Aug 2025 01:03:05 +0800 Subject: [PATCH 02/23] [CI/Build] Update VLM common tests (#22841) Signed-off-by: DarkLight1337 --- .../multimodal/generation/test_common.py | 16 +--------------- 
vllm/model_executor/models/minicpmv.py | 19 +------------------ 2 files changed, 2 insertions(+), 33 deletions(-) diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 2a65d7e244d71..2919bdbe91bbd 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -561,7 +561,7 @@ VLM_TEST_SETTINGS = { get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner, - # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55 + # FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49 marks=[pytest.mark.skip("HF import fails")], ), "minicpmv_26": VLMTestInfo( @@ -574,8 +574,6 @@ VLM_TEST_SETTINGS = { get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner, - # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55 - marks=[pytest.mark.skip("HF import fails")], ), "minimax_vl_01": VLMTestInfo( models=["MiniMaxAI/MiniMax-VL-01"], @@ -611,18 +609,6 @@ VLM_TEST_SETTINGS = { patch_hf_runner=model_utils.ovis_patch_hf_runner, marks=[large_gpu_mark(min_gb=32)], ), - "ovis1_6": VLMTestInfo( - models=["AIDC-AI/Ovis1.6-Llama3.2-3B"], - test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful and honest multimodal assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501 - img_idx_to_prompt=lambda idx: "\n", # noqa: E501 - max_model_len=4096, - max_num_seqs=2, - dtype="half", - # use sdpa mode for hf runner 
since ovis2 didn't work with flash_attn - hf_model_kwargs={"llm_attn_implementation": "sdpa"}, - patch_hf_runner=model_utils.ovis_patch_hf_runner, - ), "ovis2": VLMTestInfo( models=["AIDC-AI/Ovis2-1B"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 88dd1a57626f2..47ce771d8c901 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -85,30 +85,13 @@ class MiniCPMVImagePixelInputs(TensorSchema): - w: Width """ - def _validate_nested_tensors( - self, - value: Union[list[torch.Tensor], tuple[torch.Tensor, ...]], - field_name: str, - expected_shape: tuple[Union[int, str], ...], - dynamic_dims: set[str], - ) -> tuple[int, ...]: - # value[0] is the scaled image, - # and value[1:] is a collection of image slices. - # It is ensured that all slices in the collection - # have the same shape. - if field_name == "pixel_values": - value = value[1:] if len(value) > 1 else value - - return super()._validate_nested_tensors(value, field_name, - expected_shape, dynamic_dims) - type: Literal["pixel_values"] = "pixel_values" # Note that the image size may vary, so we pass it as a list instead of a # batched tensor. 
pixel_values: Annotated[ list[torch.Tensor], - TensorShape("bns", "c", "h", "w"), + TensorShape("bns", "c", "h", "w", dynamic_dims={"h", "w"}), ] tgt_sizes: Annotated[ torch.Tensor, From 12817a8ac7f0e9b70bfd785b1fb54c28966e7935 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Wed, 13 Aug 2025 19:35:50 +0200 Subject: [PATCH 03/23] [CI] Fix `tests/v1/e2e/test_kv_sharing_fast_prefill.py` import on test (#22815) Signed-off-by: NickLucche --- tests/v1/e2e/test_kv_sharing_fast_prefill.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py index f5a7b9cc276b3..d72e50e5196b8 100644 --- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py +++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py @@ -11,7 +11,8 @@ from vllm import LLM, SamplingParams from vllm.config import CompilationConfig, CompilationLevel from vllm.distributed import cleanup_dist_env_and_memory from vllm.forward_context import get_forward_context -from vllm.model_executor.models.gemma3n import Gemma3nForConditionalGeneration +from vllm.model_executor.models.gemma3n_mm import ( + Gemma3nForConditionalGeneration) from vllm.model_executor.models.registry import ModelRegistry from vllm.model_executor.models.utils import extract_layer_index from vllm.sequence import IntermediateTensors @@ -32,12 +33,13 @@ class TestGemma3nForConditionalGeneration(Gemma3nForConditionalGeneration): inputs_embeds: Optional[torch.Tensor] = None, **kwargs, ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = self.model(input_ids, positions, intermediate_tensors, - inputs_embeds, **kwargs) + hidden_states = super().forward(input_ids, positions, + intermediate_tensors, inputs_embeds, + **kwargs) attn_metadata = get_forward_context().attn_metadata # attn_metadata is None during dummy runs if (attn_metadata is not None - and self.cache_config.kv_sharing_fast_prefill): + and 
self.language_model.cache_config.kv_sharing_fast_prefill): assert isinstance(attn_metadata, dict) # true in V1 # Gemma3n-E2B has 30 layers, with last 20 layers being # cross-decoder layers. Check attention metadata is correct @@ -52,7 +54,7 @@ class TestGemma3nForConditionalGeneration(Gemma3nForConditionalGeneration): # Last layer will be a KV sharing layer layer_attn_metadata = attn_metadata[ - self.model.language_model.layers[-1].self_attn.attn.layer_name] + self.language_model.model.layers[-1].self_attn.attn.layer_name] logits_indices_padded = (layer_attn_metadata.logits_indices_padded) assert logits_indices_padded is not None num_logits_indices = layer_attn_metadata.num_logits_indices From b4b78d63170ff0b1e5310c295473109d92ee51c2 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 14 Aug 2025 01:55:25 +0800 Subject: [PATCH 04/23] [CI/Build] Fix param mismatch in `test_eagle_correctness` (#22847) Signed-off-by: DarkLight1337 --- tests/v1/e2e/test_spec_decode.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 599916c0d1cfb..dde95fbe590b3 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -146,7 +146,11 @@ def test_ngram_correctness( marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")), ], ids=[ - "qwen3_eagle3", "llama3_eagle", "llama3_eagle3", "llama4_eagle", + # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501 + # "qwen3_eagle3", + "llama3_eagle", + "llama3_eagle3", + "llama4_eagle", "llama4_eagle_mm" ]) @pytest.mark.parametrize("attn_backend", From df0e0f023e1be63d259280ccb9caf5547302ad30 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 14 Aug 2025 04:36:28 +0800 Subject: [PATCH 05/23] [CI/Build] Skip gpt_big model test because of broken HF model (#22848) Signed-off-by: Isotr0py --- tests/models/registry.py | 3 ++- 1 file changed, 2 insertions(+), 1 
deletion(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index d7d20d1f3abf7..eb48c0f6a7738 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -195,7 +195,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", {"alias": "gpt2"}), "GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder", - {"tiny": "bigcode/tiny_starcoder_py"}), # noqa: E501 + extras={"tiny": "bigcode/tiny_starcoder_py"}, # noqa: E501 + min_transformers_version="4.55.1"), "GPTJForCausalLM": _HfExamplesInfo("Milos/slovak-gpt-j-405M", {"6b": "EleutherAI/gpt-j-6b"}), "GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-70m", From c6cd5ca3d3294f62bf5fad25ada8192ba39249b9 Mon Sep 17 00:00:00 2001 From: kliuae <17350011+kliuae@users.noreply.github.com> Date: Thu, 14 Aug 2025 04:45:03 +0800 Subject: [PATCH 06/23] [ROCm][Bugfix] Fix compilation error in topk softmax fused kernel (#22819) Signed-off-by: kliuae --- csrc/moe/topk_softmax_kernels.cu | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index 946c137db6366..99c52ef17d08b 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -423,12 +423,27 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert); } +#ifndef USE_ROCM #define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \ - static_assert(WARP_SIZE == 32 || WARP_SIZE == 64, \ - "Unsupported warp size. Only 32 and 64 are supported."); \ + static_assert(WARP_SIZE == 32, \ + "Unsupported warp size. 
Only 32 is supported for CUDA"); \ topkGatingSoftmaxLauncherHelper( \ gating_output, nullptr, topk_weights, topk_indices, \ token_expert_indices, num_tokens, topk, 0, num_experts, stream); +#else +#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \ + if (WARP_SIZE == 64) { \ + topkGatingSoftmaxLauncherHelper( \ + gating_output, nullptr, topk_weights, topk_indices, \ + token_expert_indices, num_tokens, topk, 0, num_experts, stream); \ + } else if (WARP_SIZE == 32) { \ + topkGatingSoftmaxLauncherHelper( \ + gating_output, nullptr, topk_weights, topk_indices, \ + token_expert_indices, num_tokens, topk, 0, num_experts, stream); \ + } else { \ + assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \ + } +#endif template void topkGatingSoftmaxKernelLauncher( @@ -443,7 +458,9 @@ void topkGatingSoftmaxKernelLauncher( cudaStream_t stream) { static constexpr int WARPS_PER_TB = 4; static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16; +#ifndef USE_ROCM static constexpr int BYTES_PER_LDG_MULTIPLE_64 = 8; +#endif switch (num_experts) { case 1: LAUNCH_SOFTMAX(1, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); From 4e8614e88bf621d11682b6f387c8640f6c9ad086 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= Date: Wed, 13 Aug 2025 17:38:35 -0400 Subject: [PATCH 07/23] Move checklist in PR template (#22852) Signed-off-by: Luka Govedic --- .github/PULL_REQUEST_TEMPLATE.md | 20 ++++++++++++-------- .github/scripts/cleanup_pr_body.sh | 8 ++++---- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index d4aceab4472fa..1b30c1292df85 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,11 +1,5 @@ -# Essential Elements of an Effective PR Description Checklist - -- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)". -- [ ] The test plan, such as providing test command. 
-- [ ] The test results, such as pasting the results comparison before and after, or e2e results -- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model. - -PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED. + +PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED. ## Purpose @@ -15,4 +9,14 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE B ## (Optional) Documentation Update +--- +
+ Essential Elements of an Effective PR Description Checklist + +- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)". +- [ ] The test plan, such as providing test command. +- [ ] The test results, such as pasting the results comparison before and after, or e2e results +- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model. +
+ **BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions) diff --git a/.github/scripts/cleanup_pr_body.sh b/.github/scripts/cleanup_pr_body.sh index 8d65936fba1d8..25af344aab2be 100755 --- a/.github/scripts/cleanup_pr_body.sh +++ b/.github/scripts/cleanup_pr_body.sh @@ -15,11 +15,11 @@ NEW=/tmp/new_pr_body.txt gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}" cp "${OLD}" "${NEW}" -# Remove "FIX #xxxx (*link existing issues this PR will resolve*)" -sed -i '/FIX #xxxx.*$/d' "${NEW}" +# Remove markdown comments (like the at the start) +sed -i '/$/d' "${NEW}" -# Remove "FILL IN THE PR DESCRIPTION HERE" -sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}" +# Remove "PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED." +sed -i '/PLEASE FILL IN THE PR DESCRIPTION HERE.*$/d' "${NEW}" # Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**" sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}" From 31a500c86fb38417a2696d516a8ea1a642a5df06 Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Wed, 13 Aug 2025 14:44:06 -0700 Subject: [PATCH 08/23] [Core] [N-gram SD Optimization][1/n] Propose tokens with a single KMP (#22437) Signed-off-by: Jialin Ouyang --- benchmarks/benchmark_block_pool.py | 74 ++++++++++ benchmarks/benchmark_ngram_proposer.py | 112 +++++++++++++++ benchmarks/benchmark_utils.py | 55 +++++++- benchmarks/kv_cache/benchmark_block_pool.py | 108 --------------- tests/v1/spec_decode/test_ngram.py | 102 +++++++++----- vllm/v1/spec_decode/ngram_proposer.py | 143 ++++++++++++-------- 6 files changed, 388 insertions(+), 206 deletions(-) create mode 100644 benchmarks/benchmark_block_pool.py create mode 100644 benchmarks/benchmark_ngram_proposer.py delete mode 100644 benchmarks/kv_cache/benchmark_block_pool.py diff --git a/benchmarks/benchmark_block_pool.py 
b/benchmarks/benchmark_block_pool.py new file mode 100644 index 0000000000000..fd363c2ad0514 --- /dev/null +++ b/benchmarks/benchmark_block_pool.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import gc + +from tabulate import tabulate + +from benchmark_utils import TimeCollector +from vllm.utils import FlexibleArgumentParser +from vllm.v1.core.block_pool import BlockPool + + +def main(args): + rows = [] + for allocate_block in args.allocate_blocks: + # Enforce a GC collect ahead to minimize the impact among runs + gc.collect() + block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True) + + get_blocks_times = TimeCollector(TimeCollector.US) + free_blocks_times = TimeCollector(TimeCollector.US) + for _ in range(args.num_iteration): + with get_blocks_times: + blocks = block_pool.get_new_blocks(allocate_block) + with free_blocks_times: + block_pool.free_blocks(blocks) + + rows.append( + [get_blocks_times.cnt, args.num_gpu_blocks, allocate_block] + + get_blocks_times.dump_avg_max() + + free_blocks_times.dump_avg_max() + ) + + print( + tabulate( + rows, + headers=[ + "Iterations", + "Total\nBlocks", + "Allocated\nBlocks", + "Get Blocks\nAvg (us)", + "Get Blocks\nMax (us)", + "Free Blocks\nAvg (us)", + "Free Blocks\nMax (us)", + ], + tablefmt="grid", + floatfmt=".3f", + ) + ) + + +def invoke_main() -> None: + parser = FlexibleArgumentParser( + description="Benchmark the performance of BlockPool for KV Cache." 
+ ) + parser.add_argument("--num-gpu-blocks", type=int, default=100000) + parser.add_argument( + "--num-iteration", + type=int, + default=1000, + help="Number of iterations to run to stablize final data readings", + ) + parser.add_argument( + "--allocate-blocks", + type=int, + nargs="*", + default=[10, 50, 100, 500, 1000], + help="Number of blocks to allocate", + ) + args = parser.parse_args() + main(args) + + +if __name__ == "__main__": + invoke_main() # pragma: no cover diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py new file mode 100644 index 0000000000000..c60040d05ab7a --- /dev/null +++ b/benchmarks/benchmark_ngram_proposer.py @@ -0,0 +1,112 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import gc + +import numpy as np +from tabulate import tabulate + +from benchmark_utils import TimeCollector +from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig +from vllm.utils import FlexibleArgumentParser +from vllm.v1.spec_decode.ngram_proposer import NgramProposer + + +def main(args): + rows = [] + for max_ngram in args.max_ngram: + collector = TimeCollector(TimeCollector.US) + + model_config = ModelConfig( + model="facebook/opt-125m", + task="generate", + max_model_len=args.num_token + args.num_spec_token, + tokenizer="facebook/opt-125m", + tokenizer_mode="auto", + dtype="auto", + seed=None, + trust_remote_code=False, + ) + proposer = NgramProposer( + vllm_config=VllmConfig( + model_config=model_config, + speculative_config=SpeculativeConfig( + prompt_lookup_min=args.min_ngram, + prompt_lookup_max=max_ngram, + num_speculative_tokens=args.num_spec_token, + method="ngram", + ), + ) + ) + + # Warm up + proposer.propose(np.random.randint(0, 20, (args.num_token,))) + + gc.collect() + for _ in range(args.num_iteration): + tokens = np.random.randint(0, 20, (args.num_req, args.num_token)) + with collector: + for i in range(args.num_req): + 
proposer.propose(tokens[i, :]) + rows.append( + [args.num_req, args.num_token, args.min_ngram, max_ngram] + + collector.dump_avg_max() + ) + + print( + tabulate( + rows, + headers=[ + "# Request", + "# Token", + "Min Ngram", + "Max Ngram", + "Avg (us)", + "Max (us)", + ], + tablefmt="grid", + floatfmt=".3f", + ) + ) + + +def invoke_main() -> None: + parser = FlexibleArgumentParser( + description="Benchmark the performance of N-gram speculative decode drafting" + ) + parser.add_argument( + "--num-iteration", + type=int, + default=100, + help="Number of iterations to run to stablize final data readings", + ) + parser.add_argument( + "--num-req", type=int, default=128, help="Number of requests in the batch" + ) + parser.add_argument( + "--num-token", type=int, default=1500, help="Number of tokens for each request" + ) + parser.add_argument( + "--min-ngram", + type=int, + default=3, + help="Minimum n-gram to match", + ) + parser.add_argument( + "--max-ngram", + type=int, + nargs="*", + default=[5, 7, 10, 15, 20], + help="Maximum n-gram to match", + ) + parser.add_argument( + "--num-spec-token", + type=int, + default=3, + help="Number of speculative tokens to generate", + ) + args = parser.parse_args() + main(args) + + +if __name__ == "__main__": + invoke_main() # pragma: no cover diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index 283f938df50af..98624abdf49fb 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -1,11 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - import argparse import json import math import os -from typing import Any +import time +from types import TracebackType +from typing import Any, Optional, Union def convert_to_pytorch_benchmark_format( @@ -72,3 +73,53 @@ def write_to_json(filename: str, records: list) -> None: cls=InfEncoder, default=lambda o: f"<{type(o).__name__} object is not JSON serializable>", ) + + +# Collect 
time and generate time metrics +# +# Example Usage: +# collector = TimeCollector(TimeCollector.US) +# for _ in range(total_iteration): +# with collector: +# ... +# collector.dump_avg_max() +class TimeCollector: + NS: int = 1 + US: int = NS * 1000 + MS: int = US * 1000 + S: int = MS * 1000 + + def __init__(self, scale: int) -> None: + self.cnt: int = 0 + self._sum: int = 0 + self._max: Optional[int] = None + self.scale = scale + self.start_time: int = time.monotonic_ns() + + def collect(self, v: int) -> None: + self.cnt += 1 + self._sum += v + if self._max is None: + self._max = v + else: + self._max = max(self._max, v) + + def avg(self) -> Union[float, str]: + return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A" + + def max(self) -> Union[float, str]: + return self._max / self.scale if self._max else "N/A" + + def dump_avg_max(self) -> list[Union[float, str]]: + return [self.avg(), self.max()] + + def __enter__(self) -> None: + self.start_time = time.monotonic_ns() + + def __exit__( + self, + exc_type: Optional[type[BaseException]], + exc_value: Optional[BaseException], + exc_traceback: Optional[TracebackType], + ) -> None: + self.collect(time.monotonic_ns() - self.start_time) diff --git a/benchmarks/kv_cache/benchmark_block_pool.py b/benchmarks/kv_cache/benchmark_block_pool.py deleted file mode 100644 index 134551bb61285..0000000000000 --- a/benchmarks/kv_cache/benchmark_block_pool.py +++ /dev/null @@ -1,108 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import gc -import time -from typing import Optional - -from tabulate import tabulate - -from vllm.utils import FlexibleArgumentParser -from vllm.v1.core.block_pool import BlockPool - - -class Metric: - def __init__(self) -> None: - self.cnt: int = 0 - self.sum_v: int = 0 - self.max_v: Optional[int] = None - - def update(self, v: int) -> None: - self.cnt += 1 - self.sum_v += v - if self.max_v is None: - self.max_v = v - 
else: - self.max_v = max(self.max_v, v) - - def avg_v(self) -> float: - return self.sum_v * 1.0 / self.cnt - - -def main(args): - rows = [] - for allocate_block in args.allocate_blocks: - # Enforce a GC collect ahead to minimize the impact among runs - gc.collect() - block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True) - - get_blocks_metric: Metric = Metric() - free_blocks_metric: Metric = Metric() - for _ in range(args.num_iteration): - t1 = time.monotonic_ns() - blocks = block_pool.get_new_blocks(allocate_block) - t2 = time.monotonic_ns() - block_pool.free_blocks(blocks) - t3 = time.monotonic_ns() - get_blocks_metric.update(t2 - t1) - free_blocks_metric.update(t3 - t2) - - if get_blocks_metric.max_v is not None and free_blocks_metric.max_v is not None: - rows.append( - [ - get_blocks_metric.cnt, - args.num_gpu_blocks, - allocate_block, - get_blocks_metric.avg_v() / 1000000, - get_blocks_metric.max_v / 1000000.0, - free_blocks_metric.avg_v() / 1000000, - free_blocks_metric.max_v / 1000000.0, - ] - ) - else: - print( - "No valid metrics found." - f" {get_blocks_metric.max_v=} {free_blocks_metric.max_v=}" - ) - - print( - tabulate( - rows, - headers=[ - "Iterations", - "Total\nBlocks", - "Allocated\nBlocks", - "Get Blocks\nAvg (ms)", - "Get Blocks\nMax (ms)", - "Free Blocks\nAvg (ms)", - "Free Blocks\nMax (ms)", - ], - tablefmt="grid", - floatfmt=".6f", - ) - ) - - -def invoke_main() -> None: - parser = FlexibleArgumentParser( - description="Benchmark the performance of BlockPool for KV Cache." 
- ) - parser.add_argument("--num-gpu-blocks", type=int, default=100000) - parser.add_argument( - "--num-iteration", - type=int, - default=1000, - help="Number of iterations to run to stablize final data readings", - ) - parser.add_argument( - "--allocate-blocks", - type=int, - nargs="*", - default=[10, 50, 100, 500, 1000], - help="Number of blocks to allocate", - ) - args = parser.parse_args() - main(args) - - -if __name__ == "__main__": - invoke_main() # pragma: no cover diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py index b7303e0443d32..4193f4041b32b 100644 --- a/tests/v1/spec_decode/test_ngram.py +++ b/tests/v1/spec_decode/test_ngram.py @@ -1,43 +1,63 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - import numpy as np from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig -from vllm.v1.spec_decode.ngram_proposer import (NgramProposer, - _find_subarray_kmp, - _kmp_lps_array) +from vllm.v1.spec_decode.ngram_proposer import ( + NgramProposer, _find_longest_matched_ngram_and_propose_tokens) -def test_kmp_lps_array(): - np.testing.assert_array_equal(_kmp_lps_array(np.array([])), np.array([])) - np.testing.assert_array_equal(_kmp_lps_array(np.array([1])), np.array([0])) - np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 1, 1])), - np.array([0, 1, 2])) - np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 2, 3, 4])), - np.array([0, 0, 0, 0])) - np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 2, 1, 2, 3])), - np.array([0, 0, 1, 2, 0])) +def test_find_longest_matched_ngram_and_propose_tokens(): + tokens = np.array([1, 2, 3, 4, 1, 2, 3, 5, 6]) + assert _find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens, + min_ngram=2, + max_ngram=2, + max_model_len=1024, + k=2) is None + tokens = np.array([1, 2, 3, 4, 1, 2, 3]) + np.testing.assert_array_equal( + _find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens, + 
min_ngram=2, + max_ngram=2, + max_model_len=1024, + k=3), + np.array([4, 1, 2])) + np.testing.assert_array_equal( + _find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens, + min_ngram=2, + max_ngram=2, + max_model_len=1024, + k=2), np.array([4, 1])) + np.testing.assert_array_equal( + _find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens, + min_ngram=1, + max_ngram=1, + max_model_len=1024, + k=3), + np.array([4, 1, 2])) + np.testing.assert_array_equal( + _find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens, + min_ngram=1, + max_ngram=1, + max_model_len=1024, + k=2), np.array([4, 1])) -def test_find_subarray_kmp(): - X = np.array([1, 2, 3, 4, 1, 2, 3, 5, 6]) - assert _find_subarray_kmp(X, 2, 2) is None - X = np.array([1, 2, 3, 4, 1, 2, 3]) - np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 3), - np.array([4, 1, 2])) - np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 2), np.array([4, - 1])) - np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 3), - np.array([4, 1, 2])) - np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 2), np.array([4, - 1])) - X = np.array([1, 3, 6, 2, 3, 4, 1, 2, 3]) - np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 3), - np.array([4, 1, 2])) + tokens = np.array([1, 3, 6, 2, 3, 4, 1, 2, 3]) + np.testing.assert_array_equal( + _find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens, + min_ngram=2, + max_ngram=2, + max_model_len=1024, + k=3), + np.array([4, 1, 2])) # Return on the first match - np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 3), - np.array([6, 2, 3])) + np.testing.assert_array_equal( + _find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens, + min_ngram=1, + max_ngram=1, + max_model_len=1024, + k=2), np.array([6, 2])) def test_ngram_proposer(): @@ -56,27 +76,35 @@ def test_ngram_proposer(): # No match. 
result = ngram_proposer( - 2, 2, 2).propose(context_token_ids=np.array([1, 2, 3, 4, 5])) + min_n=2, max_n=2, + k=2).propose(context_token_ids=np.array([1, 2, 3, 4, 5])) assert result is None # No match for 4-gram. result = ngram_proposer( - 4, 4, 2).propose(context_token_ids=np.array([1, 2, 3, 4, 1, 2, 3])) + min_n=4, max_n=4, + k=2).propose(context_token_ids=np.array([1, 2, 3, 4, 1, 2, 3])) assert result is None # No match for 4-gram but match for 3-gram. result = ngram_proposer( - 3, 4, 2).propose(context_token_ids=np.array([1, 2, 3, 4, 1, 2, 3])) + min_n=3, max_n=4, + k=2).propose(context_token_ids=np.array([1, 2, 3, 4, 1, 2, 3])) assert np.array_equal(result, np.array([4, 1])) # Match for both 4-gram and 3-gram. # In this case, the proposer should return the 4-gram match. - result = ngram_proposer(3, 4, 2).propose( + result = ngram_proposer(min_n=3, max_n=4, k=2).propose( context_token_ids=np.array([2, 3, 4, 5, 1, 2, 3, 4, 1, 2, 3, 4])) assert np.array_equal(result, np.array([1, 2])) # Not [5, 1] # Match for 2-gram and 3-gram, but not 4-gram. - result = ngram_proposer( - 2, 4, - 2).propose(context_token_ids=np.array([3, 4, 5, 2, 3, 4, 1, 2, 3, 4])) + result = ngram_proposer(min_n=2, max_n=4, k=2).propose( + context_token_ids=np.array([3, 4, 5, 2, 3, 4, 1, 2, 3, 4])) assert np.array_equal(result, np.array([1, 2])) # Not [5, 2] + + # Multiple 3-gram matched, but always pick the first one. 
+ result = ngram_proposer( + min_n=3, max_n=3, k=2).propose(context_token_ids=np.array( + [1, 2, 3, 100, 1, 2, 3, 200, 1, 2, 3, 300, 1, 2, 3])) + assert np.array_equal(result, np.array([100, 1])) diff --git a/vllm/v1/spec_decode/ngram_proposer.py b/vllm/v1/spec_decode/ngram_proposer.py index 6b90d0970bd77..fbcf2cb50d371 100644 --- a/vllm/v1/spec_decode/ngram_proposer.py +++ b/vllm/v1/spec_decode/ngram_proposer.py @@ -11,6 +11,10 @@ from vllm.config import VllmConfig class NgramProposer: def __init__(self, vllm_config: VllmConfig): + assert vllm_config.speculative_config is not None + assert vllm_config.speculative_config.prompt_lookup_min is not None + assert vllm_config.speculative_config.prompt_lookup_max is not None + # Minimum length of the n-gram to match. self.min_n = vllm_config.speculative_config.prompt_lookup_min # Maximum length of the n-gram to match. @@ -54,17 +58,13 @@ class NgramProposer: followed that pattern. Here we will return [4,2,3] because we only have three tokens after the match. """ - # Do not generate draft tokens beyond the max model length. - k = min(self.k, self.max_model_len - context_token_ids.shape[0]) - if k <= 0: - return None - # TODO(woosuk): Optimize this. - for n in range(self.max_n, self.min_n - 1, -1): - result = _find_subarray_kmp(context_token_ids, n, k) - if result is not None: - return result - return None + return _find_longest_matched_ngram_and_propose_tokens( + origin_tokens=context_token_ids, + min_ngram=self.min_n, + max_ngram=self.max_n, + max_model_len=self.max_model_len, + k=self.k) def load_model(self, *args, **kwargs): # No model to load. @@ -72,61 +72,86 @@ class NgramProposer: @jit(nopython=True) -def _kmp_lps_array(pattern: np.ndarray) -> np.ndarray: +def _find_longest_matched_ngram_and_propose_tokens( + origin_tokens: np.ndarray, min_ngram: int, max_ngram: int, + max_model_len: int, k: int) -> Optional[np.ndarray]: """ - Build the lps (longest proper prefix which is also suffix) - array for the pattern. 
+ Find the longest n-gram which matches the suffix of the given tokens + whose length is within [min_ngram, max_ngram] (inclusive). + + If found, we will extract k tokens right after the matched ngram. + """ - lps = np.zeros(len(pattern), dtype=np.int32) - prev_lps = 0 # length of the previous longest prefix suffix + # Do not generate draft tokens if context is shorter than minimum n-gram + total_token = origin_tokens.shape[0] + if total_token < min_ngram: + return None + + # Do not generate draft tokens beyond the max model length. + k = min(k, max_model_len - total_token) + if k <= 0: + return None + + # Flip tokens, and the goal becomes to find the longest ngram + # on the rightmost position which matches the prefix with + # length [min_n, max_n] (inclusive). + tokens = origin_tokens[::-1] + + # Longest prefix (not including itself) which is a suffix of + # the current position. + # lps[i] = max{v, where tokens[0:v] == tokens[i+1-v:i+1]} + # + # As ngram is capped by max_ngram to save memory, we only need to + # store lps for the first max_ngram prefix. + lps = np.zeros(max_ngram, dtype=np.int32) + + longest_ngram = 0 + position = 0 + + # lps[0] is always equal to 0, so we start with index 1 + prev_lps = 0 i = 1 - - while i < len(pattern): - if pattern[i] == pattern[prev_lps]: + while i < total_token: + # tokens[:prev_lps] is the longest prefix as a suffix of tokens[:i] + if tokens[prev_lps] == tokens[i]: + # Token match: tokens[:prev_lps+1] is the longest prefix as + # a suffix of tokens[:i+1] prev_lps += 1 - lps[i] = prev_lps + # Check if we found a longer valid ngram. + # + # Update position when longest_ngram matched prev_lps, + # as we want to get the target n-gram of the earliest position + # in the original tokens (i.e.
+ # latest position in the reversed tokens) + if prev_lps >= longest_ngram: + longest_ngram = prev_lps + position = i + if i < max_ngram: + # Store LPS for the first max_ngram prefix + lps[i] = prev_lps + if prev_lps == max_ngram: + # When prev_lps reached max_ngram, update prev_lps + # to lps[max_ngram-1] to avoid matching ngram + # longer than max_ngram + prev_lps = lps[max_ngram - 1] i += 1 + elif prev_lps != 0: + # Token mismatch: try the second longest prefix + # among all suffix of tokens[:i], + # which is the longest prefix of tokens[:prev_lps] + prev_lps = lps[prev_lps - 1] else: - if prev_lps != 0: - prev_lps = lps[prev_lps - 1] - else: - lps[i] = 0 - i += 1 - return lps - - -@jit(nopython=True) -def _find_subarray_kmp( - context_token_ids: np.ndarray, - n: int, - k: int, -) -> Optional[np.ndarray]: - context_len = context_token_ids.shape[0] - assert n > 0 - - pattern = context_token_ids[-n:] - # Precompute lps array for Y - lps = _kmp_lps_array(pattern) - - i = 0 - j = 0 - # -n because the last n tokens are used as pattern - while i < context_len - n: - if context_token_ids[i] == pattern[j]: + # Token mismatch, and no more prefix (except empty string) + # as a suffix of tokens[:i] i += 1 - j += 1 - # If we have matched the entire Y - if j == n: - # Found pattern in context, gather the next K elements - return context_token_ids[i:i + k] - else: - # Mismatch - if j != 0: - # Use the lps array to avoid re-checking elements - j = lps[j - 1] - else: - i += 1 + if longest_ngram < min_ngram: + # No valid ngram is found + return None - # Y not found - return None + # Flip the position back, so in origin_tokens, + # origin_tokens[total_token-1-position:total_token-1-position+longest_ngram] + # is the matched ngram, so we should start drafting tokens from + # total_token-1-position+longest_ngram + start_position = total_token - 1 - position + longest_ngram + k = min(k, total_token - start_position) + return origin_tokens[start_position:start_position + k] From 
0ca2393b47e72c4424a49aa3b32c7c5d0e378a72 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 14 Aug 2025 06:52:48 +0800 Subject: [PATCH 09/23] [CI/Build] Increase pooling tolerance to pass CI (#22844) Signed-off-by: DarkLight1337 Signed-off-by: mgoin Co-authored-by: mgoin --- tests/models/language/pooling/test_intfloat.py | 2 +- tests/models/language/pooling/test_snowflake_arctic_embed.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/language/pooling/test_intfloat.py b/tests/models/language/pooling/test_intfloat.py index e48bdbe940be7..6cae53a660ad8 100644 --- a/tests/models/language/pooling/test_intfloat.py +++ b/tests/models/language/pooling/test_intfloat.py @@ -36,7 +36,7 @@ MODELS = [ @pytest.mark.parametrize("model_info", MODELS) def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None: - mteb_test_embed_models(hf_runner, vllm_runner, model_info) + mteb_test_embed_models(hf_runner, vllm_runner, model_info, atol=0.02) @pytest.mark.parametrize("model_info", MODELS) diff --git a/tests/models/language/pooling/test_snowflake_arctic_embed.py b/tests/models/language/pooling/test_snowflake_arctic_embed.py index 585fa0e683da2..c22c78592e535 100644 --- a/tests/models/language/pooling/test_snowflake_arctic_embed.py +++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py @@ -46,7 +46,7 @@ MODELS = [ @pytest.mark.parametrize("model_info", MODELS) def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None: - mteb_test_embed_models(hf_runner, vllm_runner, model_info) + mteb_test_embed_models(hf_runner, vllm_runner, model_info, atol=0.02) @pytest.mark.parametrize("model_info", MODELS) From b6af24fba73cad27254e8826bbd842810cce7ee8 Mon Sep 17 00:00:00 2001 From: Will Eaton Date: Wed, 13 Aug 2025 23:09:07 -0400 Subject: [PATCH 10/23] [CI][Entrypoints]: add filter to generation to filter out invalid tool calls (#22826) Signed-off-by: Will Eaton --- 
.../entrypoints/openai/test_openai_schema.py | 48 ++++++++++++------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/tests/entrypoints/openai/test_openai_schema.py b/tests/entrypoints/openai/test_openai_schema.py index 771119d04ea31..246bd014aa690 100644 --- a/tests/entrypoints/openai/test_openai_schema.py +++ b/tests/entrypoints/openai/test_openai_schema.py @@ -54,38 +54,54 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy): op = context.operation assert op is not None - def no_file_type(case: schemathesis.models.Case): + def no_invalid_types(case: schemathesis.models.Case): """ - This filter skips test cases for the `POST /tokenize` endpoint where the - HTTP request body uses `"type": "file"` in any message's content. - We expect these cases to fail because that type isn't implemented here - https://github.com/vllm-project/vllm/blob/0b34593017953051b3225b1483ce0f4670e3eb0e/vllm/entrypoints/chat_utils.py#L1038-L1095 + This filter skips test cases with invalid data that schemathesis + incorrectly generates due to permissive schema configurations. + + 1. Skips `POST /tokenize` endpoint cases with `"type": "file"` in + message content, which isn't implemented. + + 2. Skips tool_calls with `"type": "custom"` which schemathesis + incorrectly generates instead of the valid `"type": "function"`. 
Example test cases that are skipped: curl -X POST -H 'Content-Type: application/json' \ - -d '{"messages": [{"role": "assistant"}, {"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \ + -d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \ http://localhost:8000/tokenize curl -X POST -H 'Content-Type: application/json' \ - -d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \ - http://localhost:8000/tokenize + -d '{"messages": [{"role": "assistant", "tool_calls": [{"custom": {"input": "", "name": ""}, "id": "", "type": "custom"}]}]}' \ + http://localhost:8000/v1/chat/completions """ # noqa: E501 - if (op.method.lower() == "post" and op.path == "/tokenize" - and hasattr(case, "body") and isinstance(case.body, dict) + if (hasattr(case, "body") and isinstance(case.body, dict) and "messages" in case.body and isinstance(case.body["messages"], list) and len(case.body["messages"]) > 0): + for message in case.body["messages"]: if not isinstance(message, dict): continue - content = message.get("content", []) - if not isinstance(content, list) or len(content) == 0: - continue - if any(item.get("type") == "file" for item in content): - return False + + # Check for invalid file type in tokenize endpoint + if op.method.lower() == "post" and op.path == "/tokenize": + content = message.get("content", []) + if (isinstance(content, list) and len(content) > 0 and any( + item.get("type") == "file" for item in content)): + return False + + # Check for invalid tool_calls with non-function types + tool_calls = message.get("tool_calls", []) + if isinstance(tool_calls, list): + for tool_call in tool_calls: + if isinstance(tool_call, dict): + if tool_call.get("type") != "function": + return False + if "custom" in tool_call: + return False return True - return strategy.filter(no_file_type) + return strategy.filter(no_invalid_types) @schema.parametrize() From 1d20c347179733875faf4c7802ef695ad3dec4f8 Mon Sep 17 00:00:00 2001 
From: Ilya Markov Date: Thu, 14 Aug 2025 05:09:30 +0200 Subject: [PATCH 11/23] [CI] Fix `tests/distributed/test_ca_buffer_sharing.py` (#22849) Signed-off-by: ilmarkov Co-authored-by: ilmarkov Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- vllm/distributed/device_communicators/custom_all_reduce.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index 7dd104a4fcc4e..8dfb7959a510d 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -297,7 +297,7 @@ class CustomAllreduce: @staticmethod def free_shared_buffer(pointers: list[int], group: Optional[ProcessGroup] = None, - rank: Optional[int] = 0) -> None: + rank: Optional[int] = None) -> None: if rank is None: rank = dist.get_rank(group=group) if ops is not None: From a353bd083d22c92c90479d6b5b5029c0daed49da Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Thu, 14 Aug 2025 00:41:51 -0400 Subject: [PATCH 12/23] [CI] remove flaky v0 test (#22864) Signed-off-by: Robert Shaw Co-authored-by: Robert Shaw --- tests/entrypoints/openai/test_default_mm_loras.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/tests/entrypoints/openai/test_default_mm_loras.py b/tests/entrypoints/openai/test_default_mm_loras.py index 1fc87c8b42a7a..372e9b1fecd42 100644 --- a/tests/entrypoints/openai/test_default_mm_loras.py +++ b/tests/entrypoints/openai/test_default_mm_loras.py @@ -24,18 +24,7 @@ ACTIVE_MM_LORA_RESPONSE = "Spoken text: The first words I spoke in the original @pytest.fixture(scope="module") -def monkeypatch_module(): - from _pytest.monkeypatch import MonkeyPatch - mpatch = MonkeyPatch() - yield mpatch - mpatch.undo() - - -@pytest.fixture(scope="module", params=[False, True]) -def 
multimodal_server(request, monkeypatch_module): # noqa: F811 - - use_v1 = request.param - monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0') +def multimodal_server(): # noqa: F811 args = [ # use half precision for speed and memory savings in CI environment From 00e3f9da462b31f271d9d9fdb526f148572609a9 Mon Sep 17 00:00:00 2001 From: Louie Tsai Date: Thu, 14 Aug 2025 00:12:17 -0700 Subject: [PATCH 13/23] vLLM Benchmark suite improvement (#22119) Signed-off-by: Tsai, Louie Signed-off-by: Louie Tsai Co-authored-by: Li, Jiang --- .buildkite/nightly-benchmarks/README.md | 32 ++-- .../scripts/compare-json-results.py | 175 ++++++++++++++++-- .../convert-results-json-to-markdown.py | 161 +++++++++++++++- .../scripts/run-performance-benchmarks.sh | 85 +++++---- .../tests/latency-tests-cpu.json | 4 +- .../tests/serving-tests-cpu-snc2.json | 49 +++-- .../tests/serving-tests-cpu-snc3.json | 52 +++--- .../tests/serving-tests-cpu.json | 30 +-- .../tests/throughput-tests-cpu.json | 4 +- docs/contributing/benchmarks.md | 2 +- 10 files changed, 447 insertions(+), 147 deletions(-) diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index 3f2e2da397977..b39f9899a8f28 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -7,7 +7,7 @@ This directory contains two sets of benchmark for vllm. - Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance - Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm. -See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results. 
+See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results. ## Performance benchmark quick overview @@ -138,28 +138,20 @@ The raw benchmarking results (in the format of json files) are in the `Artifacts The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`. When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`. -`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT. +`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT. +If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead. -Here is an example using the script to compare result_a and result_b without detail test name. -`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json --ignore_test_name` - -| | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio | -|----|----------------------------------------|----------------------------------------|----------| -| 0 | 142.633982 | 156.526018 | 1.097396 | -| 1 | 241.620334 | 294.018783 | 1.216863 | -| 2 | 218.298905 | 262.664916 | 1.203235 | -| 3 | 242.743860 | 299.816190 | 1.235113 | - -Here is an example using the script to compare result_a and result_b with detail test name. +Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json` -| | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio | -|---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------| -| 0 | serving_llama8B_tp1_sharegpt_qps_1 | 142.633982 | serving_llama8B_tp1_sharegpt_qps_1 | 156.526018 | 1.097396 | -| 1 | serving_llama8B_tp1_sharegpt_qps_16 | 241.620334 | serving_llama8B_tp1_sharegpt_qps_16 | 294.018783 | 1.216863 | -| 2 | serving_llama8B_tp1_sharegpt_qps_4 | 218.298905 | serving_llama8B_tp1_sharegpt_qps_4 | 262.664916 | 1.203235 | -| 3 | serving_llama8B_tp1_sharegpt_qps_inf | 242.743860 | serving_llama8B_tp1_sharegpt_qps_inf | 299.816190 | 1.235113 | -| 4 | serving_llama8B_tp2_random_1024_128_qps_1 | 96.613390 | serving_llama8B_tp4_random_1024_128_qps_1 | 108.404853 | 1.122048 | +| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio | +|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------| +| 0 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982 | 156.526018 | 1.097396 | +| 1 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334 | 294.018783 | 1.216863 | + +A comparison diagram will be generated below the table. 
+Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3 +image ## Nightly test details diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index 20c106234935c..12c4ba6aa69a6 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -1,24 +1,38 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse +import json +import os import pandas as pd def compare_data_columns( - files, name_column, data_column, drop_column, ignore_test_name=False + files, name_column, data_column, info_cols, drop_column, debug=False ): print("\ncompare_data_column: " + data_column) frames = [] + raw_data_cols = [] compare_frames = [] for file in files: data_df = pd.read_json(file) serving_df = data_df.dropna(subset=[drop_column], ignore_index=True) - if ignore_test_name is False: + # Show all info columns in the first couple columns + if not frames: + for col in info_cols: + if col not in serving_df.columns: + print(f"Skipping missing column: {col}") + continue + frames.append(serving_df[col]) + # only show test name under debug mode + if debug is True: serving_df = serving_df.rename(columns={name_column: file + "_name"}) frames.append(serving_df[file + "_name"]) + + file = "/".join(file.split("/")[:-1]) serving_df = serving_df.rename(columns={data_column: file}) frames.append(serving_df[file]) + raw_data_cols.append(file) compare_frames.append(serving_df[file]) if len(compare_frames) >= 2: # Compare numbers among two files @@ -27,7 +41,68 @@ def compare_data_columns( compare_frames.pop(1) concat_df = pd.concat(frames, axis=1) - return concat_df + print(raw_data_cols) + return concat_df, raw_data_cols + + +def split_json_by_tp_pp( + input_file: str = "benchmark_results.json", output_root: str = "." 
+) -> list[str]: + """ + Split a benchmark JSON into separate folders by (TP Size, PP Size). + + Creates: /tp{TP}_pp{PP}/benchmark_results.json + Returns: list of file paths written. + """ + # Load JSON data into DataFrame + with open(input_file, encoding="utf-8") as f: + data = json.load(f) + + # If the JSON is a dict with a list under common keys, use that list + if isinstance(data, dict): + for key in ("results", "serving_results", "benchmarks", "data"): + if isinstance(data.get(key), list): + data = data[key] + break + + df = pd.DataFrame(data) + + # Handle alias column names + rename_map = { + "tp_size": "TP Size", + "tensor_parallel_size": "TP Size", + "pp_size": "PP Size", + "pipeline_parallel_size": "PP Size", + } + df.rename( + columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True + ) + + # Ensure TP/PP columns exist (default to 1 if missing) + if "TP Size" not in df.columns: + df["TP Size"] = 1 + if "PP Size" not in df.columns: + df["PP Size"] = 1 + + # make sure TP/PP are numeric ints with no NaN + df["TP Size"] = ( + pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int) + ) + df["PP Size"] = ( + pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int) + ) + + # Split into separate folders + saved_paths: list[str] = [] + for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False): + folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}") + os.makedirs(folder_name, exist_ok=True) + filepath = os.path.join(folder_name, "benchmark_results.json") + group_df.to_json(filepath, orient="records", indent=2, force_ascii=False) + print(f"Saved: {filepath}") + saved_paths.append(filepath) + + return saved_paths if __name__ == "__main__": @@ -36,31 +111,105 @@ if __name__ == "__main__": "-f", "--file", action="append", type=str, help="input file name" ) parser.add_argument( - "--ignore_test_name", action="store_true", help="ignore_test_name or not" + "--debug", 
action="store_true", help="show all information for debugging" + ) + parser.add_argument( + "--plot", + action=argparse.BooleanOptionalAction, + default=True, + help="plot perf diagrams or not --no-plot --plot", + ) + parser.add_argument( + "-x", + "--xaxis", + type=str, + default="# of max concurrency.", + help="column name to use as X Axis in comparision graph", ) args = parser.parse_args() - files = args.file - print("comparing : " + ", ".join(files)) drop_column = "P99" name_column = "Test name" + info_cols = [ + "Model", + "Dataset Name", + "Input Len", + "Output Len", + "TP Size", + "PP Size", + "# of max concurrency.", + "qps", + ] data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"] html_msgs_for_data_cols = [ "Compare Output Tokens /n", "Median TTFT /n", "Median TPOT /n", ] - ignore_test_name = args.ignore_test_name + + if len(args.file) == 1: + files = split_json_by_tp_pp(args.file[0], output_root="splits") + info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")] + else: + files = args.file + print("comparing : " + ", ".join(files)) + debug = args.debug + plot = args.plot + # For Plot feature, assign y axis from one of info_cols + y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6 with open("perf_comparison.html", "w") as text_file: for i in range(len(data_cols_to_compare)): - output_df = compare_data_columns( + output_df, raw_data_cols = compare_data_columns( files, name_column, data_cols_to_compare[i], + info_cols, drop_column, - ignore_test_name=ignore_test_name, + debug=debug, ) - print(output_df) - html = output_df.to_html() - text_file.write(html_msgs_for_data_cols[i]) - text_file.write(html) + + # For Plot feature, insert y axis from one of info_cols + raw_data_cols.insert(0, info_cols[y_axis_index]) + + filtered_info_cols = info_cols[:-2] + existing_group_cols = [ + c for c in filtered_info_cols if c in output_df.columns + ] + if not existing_group_cols: + raise ValueError( + f"No 
valid group-by columns " + f"Expected subset: {filtered_info_cols}, " + f"but DataFrame has: {list(output_df.columns)}" + ) + + output_df_sorted = output_df.sort_values(by=existing_group_cols) + output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False) + for name, group in output_groups: + html = group.to_html() + text_file.write(html_msgs_for_data_cols[i]) + text_file.write(html) + + if plot is True: + import pandas as pd + import plotly.express as px + + df = group[raw_data_cols] + df_sorted = df.sort_values(by=info_cols[y_axis_index]) + # Melt DataFrame for plotting + df_melted = df_sorted.melt( + id_vars=info_cols[y_axis_index], + var_name="Configuration", + value_name=data_cols_to_compare[i], + ) + title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index] + # Create Plotly line chart + fig = px.line( + df_melted, + x=info_cols[y_axis_index], + y=data_cols_to_compare[i], + color="Configuration", + title=title, + markers=True, + ) + # Export to HTML + text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn")) diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index 554256b4bdb8b..496ee6083abde 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -1,17 +1,19 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse import json import os +import re +import shlex from importlib import util from pathlib import Path +from typing import Any import pandas as pd import psutil from tabulate import tabulate -results_folder = Path("results/") - # latency results and the keys that will be printed into markdown latency_results = [] latency_column_mapping = { @@ -42,14 +44,22 @@ throughput_results_column_mapping = { serving_results = [] 
serving_column_mapping = { "test_name": "Test name", + "model_id": "Model", + "dataset_name": "Dataset Name", + "input_len": "Input Len", + "output_len": "Output Len", + "tp_size": "TP Size", + "pp_size": "PP Size", + "dtype": "dtype", "gpu_type": "GPU", "completed": "# of req.", + "qps": "qps", "max_concurrency": "# of max concurrency.", "request_throughput": "Tput (req/s)", "total_token_throughput": "Total Token Tput (tok/s)", "output_throughput": "Output Tput (tok/s)", - "total_input_tokens": "Total input tokens", - "total_output_tokens": "Total output tokens", + # "total_input_tokens": "Total input tokens", + # "total_output_tokens": "Total output tokens", "mean_ttft_ms": "Mean TTFT (ms)", "median_ttft_ms": "Median TTFT (ms)", "p99_ttft_ms": "P99 TTFT (ms)", @@ -94,7 +104,104 @@ def get_size_with_unit(bytes, suffix="B"): bytes /= factor +def _coerce(val: str) -> Any: + """Best-effort type coercion from string to Python types.""" + low = val.lower() + if low == "null": + return None + if low == "true": + return True + if low == "false": + return False + # integers + if re.fullmatch(r"[+-]?\d+", val): + try: + return int(val) + except ValueError: + pass + # floats (keep 'inf'/'-inf'/'nan' as strings) + if re.fullmatch(r"[+-]?\d*\.\d+", val): + try: + return float(val) + except ValueError: + pass + return val + + +def parse_client_command(cmd: str) -> dict[str, Any]: + """Parse the client_command shell string into {executable, script, args}.""" + toks = shlex.split(cmd) + if len(toks) < 2: + raise ValueError("client_command must include an executable and a script") + executable, script = toks[0], toks[1] + args: dict[str, Any] = {} + + i = 2 + while i < len(toks): + t = toks[i] + if t.startswith("--"): + # --key=value or --key (value) or boolean flag + if "=" in t: + key, val = t.split("=", 1) + if key == "--metadata": + md = {} + if val: + if "=" in val: + k, v = val.split("=", 1) + md[k] = _coerce(v) + else: + md[val] = True + args[key] = md + else: + args[key] 
= _coerce(val) + i += 1 + continue + + key = t + + # Special: consume metadata k=v pairs until next --flag + if key == "--metadata": + i += 1 + md = {} + while i < len(toks) and not toks[i].startswith("--"): + pair = toks[i] + if "=" in pair: + k, v = pair.split("=", 1) + md[k] = _coerce(v) + else: + md[pair] = True + i += 1 + args[key] = md + continue + + # Standard: check if next token is a value (not a flag) + if i + 1 < len(toks) and not toks[i + 1].startswith("--"): + args[key] = _coerce(toks[i + 1]) + i += 2 + else: + # lone flag -> True + args[key] = True + i += 1 + else: + # unexpected positional; skip + i += 1 + + return {"executable": executable, "script": script, "args": args} + + if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-r", + "--result", + type=str, + default="results", + help="Folder name for benchmark output results.", + ) + args = parser.parse_args() + results_folder = Path(args.result) + if not results_folder.exists(): + raise FileNotFoundError(f"results folder does not exist: {results_folder}") # collect results for test_file in results_folder.glob("*.json"): with open(test_file) as f: @@ -102,7 +209,6 @@ if __name__ == "__main__": if "serving" in str(test_file): # this result is generated via `vllm bench serve` command - # attach the benchmarking command to raw_result try: with open(test_file.with_suffix(".commands")) as f: @@ -110,12 +216,44 @@ if __name__ == "__main__": except OSError as e: print(e) continue + # Parse Server Command Arg + out: dict[str, Any] = { + "server_command": parse_client_command(command["server_command"]) + } + parse_args = [ + "--tensor-parallel-size", + "--pipeline-parallel-size", + "--dtype", + ] + col_mapping = ["tp_size", "pp_size", "dtype"] + for index, arg in enumerate(parse_args): + if arg in out["server_command"]["args"]: + raw_result.update( + {col_mapping[index]: out["server_command"]["args"][arg]} + ) + # Parse Client Command Arg + out: dict[str, Any] = { + 
"client_command": parse_client_command(command["client_command"]) + } + parse_args = [ + "--dataset-name", + "--random-input-len", + "--random-output-len", + "--request-rate", + ] + col_mapping = ["dataset_name", "input_len", "output_len", "qps"] + + for index, arg in enumerate(parse_args): + if arg in out["client_command"]["args"]: + raw_result.update( + {col_mapping[index]: out["client_command"]["args"][arg]} + ) + # Add Server, Client command raw_result.update(command) # update the test name of this result raw_result.update({"test_name": test_file.stem}) - # add the result to raw_result serving_results.append(raw_result) continue @@ -205,7 +343,10 @@ if __name__ == "__main__": columns=latency_column_mapping ) if not serving_results.empty: - serving_results = serving_results[list(serving_column_mapping.keys())].rename( + valid_columns = [ + col for col in serving_column_mapping if col in serving_results.columns + ] + serving_results = serving_results[valid_columns].rename( columns=serving_column_mapping ) if not throughput_results.empty: @@ -245,7 +386,9 @@ if __name__ == "__main__": ) # document the result - with open(results_folder / "benchmark_results.md", "w") as f: + md_file = "benchmark_results.md" + json_file = "benchmark_results.json" + with open(results_folder / md_file, "w") as f: results = read_markdown( "../.buildkite/nightly-benchmarks/" + "performance-benchmarks-descriptions.md" @@ -260,7 +403,7 @@ if __name__ == "__main__": f.write(results) # document benchmarking results in json - with open(results_folder / "benchmark_results.json", "w") as f: + with open(results_folder / json_file, "w") as f: results = ( latency_results.to_dict(orient="records") + throughput_results.to_dict(orient="records") diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index 2c57666a81aa3..b1b7d2d77a44d 100644 --- 
a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -194,9 +194,11 @@ run_latency_tests() { # check if there is enough GPU to run the test tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') - if [ "$ON_CPU" == "1" ];then - if [[ $numa_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name." + if [ "$ON_CPU" == "1" ]; then + pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size // 1') + world_size=$(($tp*$pp)) + if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then + echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." continue fi else @@ -261,9 +263,11 @@ run_throughput_tests() { # check if there is enough GPU to run the test tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size') - if [ "$ON_CPU" == "1" ];then - if [[ $numa_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name." + if [ "$ON_CPU" == "1" ]; then + pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size // 1') + world_size=$(($tp*$pp)) + if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then + echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue fi else @@ -329,12 +333,21 @@ run_serving_tests() { qps_list=$(echo "$params" | jq -r '.qps_list') qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') echo "Running over qps list $qps_list" + max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list') + if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then + num_prompts=$(echo "$client_params" | jq -r '.num_prompts') + max_concurrency_list="[$num_prompts]" + fi + max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh') + echo "Running over max concurrency list $max_concurrency_list" # check if there is enough resources to run the test tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') - if [ "$ON_CPU" == "1" ];then - if [[ $numa_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name." + if [ "$ON_CPU" == "1" ]; then + pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size // 1') + world_size=$(($tp*$pp)) + if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then + echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue fi else @@ -390,35 +403,39 @@ run_serving_tests() { echo "now qps is $qps" fi - new_test_name=$test_name"_qps_"$qps + # iterate over different max_concurrency + for max_concurrency in $max_concurrency_list; do + new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency + echo " new test name $new_test_name" + # pass the tensor parallel size to the client so that it can be displayed + # on the benchmark dashboard + client_command="vllm bench serve \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + --max-concurrency $max_concurrency \ + --metadata "tensor_parallel_size=$tp" \ + $client_args $client_remote_args " - # pass the tensor parallel size to the client so that it can be displayed - # on the benchmark dashboard - client_command="vllm bench serve \ - --save-result \ - --result-dir $RESULTS_FOLDER \ - --result-filename ${new_test_name}.json \ - --request-rate $qps \ - --metadata "tensor_parallel_size=$tp" \ - $client_args $client_remote_args " + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" - echo "Running test case $test_name with qps $qps" - echo "Client command: $client_command" + bash -c "$client_command" - bash -c "$client_command" - - # record the benchmarking commands - jq_output=$(jq -n \ - --arg server "$server_command" \ - --arg client "$client_command" \ - --arg gpu "$gpu_type" \ - '{ - server_command: $server, - client_command: $client, - gpu_type: $gpu - }') - echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" + # record the benchmarking commands + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu + }') + echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" + done done # clean up diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json 
b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json index da93fdd1dbac1..569117aae852d 100644 --- a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json @@ -6,7 +6,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "load_format": "dummy", "num_iters_warmup": 5, @@ -20,7 +20,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "load_format": "dummy", "num_iters_warmup": 5, diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json index dd0e24edff98d..2d88a0b30c4f8 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json @@ -1,7 +1,8 @@ [ { "test_name": "serving_llama8B_tp1_sharegpt", - "qps_list": [1, 4, 16, "inf"], + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -10,7 +11,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -23,17 +24,17 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "max_concurrency": 60, "num_prompts": 200 } }, { "test_name": "serving_llama8B_tp2_sharegpt", - "qps_list": [1, 4, 16, "inf"], + "qps_list": ["inf"], 
+ "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -42,7 +43,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -55,17 +56,17 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "max_concurrency": 60, "num_prompts": 200 } }, { "test_name": "serving_llama8B_tp4_sharegpt", - "qps_list": [1, 4, 16, "inf"], + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -74,7 +75,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -87,17 +88,17 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "max_concurrency": 60, "num_prompts": 200 } }, { "test_name": "serving_llama8B_tp1_random_128_128", - "qps_list": [1, 4, 16, "inf"], + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -106,7 +107,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": 
"meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -120,19 +121,19 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", - "max_concurrency": 1000, "num_prompts": 1000 } }, { "test_name": "serving_llama8B_tp2_random_128_128", - "qps_list": [1, 4, 16, "inf"], + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -141,7 +142,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -155,19 +156,19 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", - "max_concurrency": 1000, "num_prompts": 1000 } }, { "test_name": "serving_llama8B_tp4_random_128_128", - "qps_list": [1, 4, 16, "inf"], + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -176,7 +177,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -190,13 +191,11 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": 
"meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, - "ignore-eos": "", - "max_concurrency": 1000, "num_prompts": 1000 } } diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json index f1bda65a7590b..823abbaa99f86 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json @@ -1,7 +1,8 @@ [ { "test_name": "serving_llama8B_pp1_sharegpt", - "qps_list": [1, 4, 16, "inf"], + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -10,7 +11,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "pipeline_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -23,17 +24,17 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "max_concurrency": 60, "num_prompts": 200 } }, { "test_name": "serving_llama8B_pp3_sharegpt", - "qps_list": [1, 4, 16, "inf"], + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -42,7 +43,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "pipeline_parallel_size": 3, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -55,17 +56,17 @@ "load_format": "dummy" }, "client_parameters": { - "model": 
"meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "max_concurrency": 60, "num_prompts": 200 } }, { - "test_name": "serving_llama8B_tp2pp6_sharegpt", - "qps_list": [1, 4, 16, "inf"], + "test_name": "serving_llama8B_tp2pp3_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -74,7 +75,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "pipeline_parallel_size": 3, "dtype": "bfloat16", @@ -88,17 +89,17 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "max_concurrency": 60, "num_prompts": 200 } }, { "test_name": "serving_llama8B_pp1_random_128_128", - "qps_list": [1, 4, 16, "inf"], + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -107,7 +108,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "pipeline_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -121,28 +122,28 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", - "max_concurrency": 1000, "num_prompts": 
1000 } }, { "test_name": "serving_llama8B_pp3_random_128_128", - "qps_list": [1, 4, 16, "inf"], + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL:": 1, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "pipeline_parallel_size": 3, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -156,19 +157,19 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", - "max_concurrency": 1000, "num_prompts": 1000 } }, { "test_name": "serving_llama8B_tp2pp3_random_128_128", - "qps_list": [1, 4, 16, "inf"], + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -177,7 +178,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "pipeline_parallel_size": 3, "dtype": "bfloat16", @@ -192,13 +193,12 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", - "max_concurrency": 1000, "num_prompts": 1000 } } diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json index f150b9abeea45..e21c8df0a9fe9 100644 --- 
a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json @@ -2,6 +2,7 @@ { "test_name": "serving_llama8B_tp1_sharegpt", "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -10,7 +11,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -23,17 +24,17 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "max_concurrency": 60, "num_prompts": 200 } }, { "test_name": "serving_llama8B_tp2_sharegpt", "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -42,7 +43,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -55,17 +56,17 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "max_concurrency": 60, "num_prompts": 200 } }, { "test_name": "serving_llama8B_tp4_sharegpt", "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, 
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -74,7 +75,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -87,17 +88,17 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "max_concurrency": 60, "num_prompts": 200 } }, { "test_name": "serving_llama8B_tp4_random_1024_128", "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -106,7 +107,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -120,19 +121,19 @@ "load_format": "dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 1024, "random-output-len": 128, "ignore-eos": "", - "max_concurrency": 100, "num_prompts": 100 } }, { "test_name": "serving_llama8B_pp6_random_1024_128", "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -141,7 +142,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "pipeline_parallel_size": 6, "dtype": "bfloat16", "distributed_executor_backend": "mp", @@ -155,13 +156,12 @@ "load_format": 
"dummy" }, "client_parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 1024, "random-output-len": 128, "ignore-eos": "", - "max_concurrency": 100, "num_prompts": 100 } } diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json index f159c30637d34..48c015aa8403b 100644 --- a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json @@ -6,7 +6,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "load_format": "dummy", "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", @@ -21,7 +21,7 @@ "VLLM_CPU_KVCACHE_SPACE": 40 }, "parameters": { - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "load_format": "dummy", "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", diff --git a/docs/contributing/benchmarks.md b/docs/contributing/benchmarks.md index 0ebd99ba5ae12..2bbed778f3c6a 100644 --- a/docs/contributing/benchmarks.md +++ b/docs/contributing/benchmarks.md @@ -11,7 +11,7 @@ vLLM contains two sets of benchmarks: The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM. -The latest performance results are hosted on the public [vLLM Performance Dashboard](https://perf.vllm.ai). +The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm). 
More information on the performance benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md). From 7c3a0741c67007dd759f52d07f7aca854628b81b Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 14 Aug 2025 17:35:43 +0800 Subject: [PATCH 14/23] [Bugfix] Fix `PixtralHFImagePixelInputs` dynamic shape check (#22827) Signed-off-by: Isotr0py --- tests/models/multimodal/test_tensor_schema.py | 2 +- vllm/model_executor/models/llava.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/models/multimodal/test_tensor_schema.py b/tests/models/multimodal/test_tensor_schema.py index a4cb1a68833a5..92390d8c2f7ee 100644 --- a/tests/models/multimodal/test_tensor_schema.py +++ b/tests/models/multimodal/test_tensor_schema.py @@ -153,4 +153,4 @@ def test_model_tensor_schema(model_arch: str, vllm_runner: type[VllmRunner], if hasattr(model, method_name): getattr(model, method_name)(**mm_kwargs) - vllm_model.apply_model(validate_model_input) + vllm_model.apply_model(validate_model_input) \ No newline at end of file diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 89d2817b57e0e..4927d6b62c6d8 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -72,8 +72,9 @@ class PixtralHFImagePixelInputs(TensorSchema): in which case the data is passed as a list instead of a batched tensor. 
""" type: Literal["pixel_values_pixtral"] = "pixel_values_pixtral" - pixel_values: Annotated[Union[torch.Tensor, list[torch.Tensor]], - TensorShape("bn", "c", "h", "w")] + pixel_values: Annotated[ + Union[torch.Tensor, list[torch.Tensor]], + TensorShape("bn", "c", "h", "w", dynamic_dims={"h", "w"})] class LlavaImageEmbeddingInputs(TensorSchema): From eb08487b18f4be3cc8dbe6776c2d69d223b3737c Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 14 Aug 2025 03:44:29 -0700 Subject: [PATCH 15/23] [BugFix] Threadsafe close async zmq sockets (#22877) Signed-off-by: Nick Hill Co-authored-by: Isotr0py --- vllm/utils/__init__.py | 24 ++++++++++- vllm/v1/engine/core_client.py | 79 ++++++++++++++++++++++++----------- 2 files changed, 77 insertions(+), 26 deletions(-) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 095829db83944..cae4eecc0deeb 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -709,8 +709,28 @@ class AsyncMicrobatchTokenizer: def cancel_task_threadsafe(task: Task): - if task and not task.done() and not (loop := task.get_loop()).is_closed(): - loop.call_soon_threadsafe(task.cancel) + if task and not task.done(): + run_in_loop(task.get_loop(), task.cancel) + + +def close_sockets(sockets: Sequence[Union[zmq.Socket, zmq.asyncio.Socket]]): + for sock in sockets: + if sock is not None: + sock.close(linger=0) + + +def run_in_loop(loop: AbstractEventLoop, function: Callable, *args): + if in_loop(loop): + function(*args) + elif not loop.is_closed(): + loop.call_soon_threadsafe(function, *args) + + +def in_loop(event_loop: AbstractEventLoop) -> bool: + try: + return asyncio.get_running_loop() == event_loop + except RuntimeError: + return False def make_async( diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 05b4d72608963..5ffa555570a22 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -23,8 +23,8 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from 
vllm.lora.request import LoRARequest from vllm.tasks import SupportedTask -from vllm.utils import (cancel_task_threadsafe, get_open_port, - get_open_zmq_inproc_path, make_zmq_socket) +from vllm.utils import (close_sockets, get_open_port, get_open_zmq_inproc_path, + in_loop, make_zmq_socket) from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, EngineCoreRequestType, ReconfigureDistributedRequest, ReconfigureRankType, @@ -317,7 +317,7 @@ class BackgroundResources: """Used as a finalizer for clean shutdown, avoiding circular reference back to the client object.""" - ctx: Union[zmq.Context] + ctx: zmq.Context # If CoreEngineProcManager, it manages local engines; # if CoreEngineActorManager, it manages all engines. engine_manager: Optional[Union[CoreEngineProcManager, @@ -326,6 +326,8 @@ class BackgroundResources: output_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None input_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None first_req_send_socket: Optional[zmq.asyncio.Socket] = None + first_req_rcv_socket: Optional[zmq.asyncio.Socket] = None + stats_update_socket: Optional[zmq.asyncio.Socket] = None output_queue_task: Optional[asyncio.Task] = None stats_update_task: Optional[asyncio.Task] = None shutdown_path: Optional[str] = None @@ -343,23 +345,47 @@ class BackgroundResources: if self.coordinator is not None: self.coordinator.close() - cancel_task_threadsafe(self.output_queue_task) - cancel_task_threadsafe(self.stats_update_task) + if isinstance(self.output_socket, zmq.asyncio.Socket): + # Async case. + loop = self.output_socket._get_loop() + # May run off-loop (finalizer / other thread): do not require a running loop here. + sockets = (self.output_socket, self.input_socket, + self.first_req_send_socket, self.first_req_rcv_socket, + self.stats_update_socket) - # ZMQ context termination can hang if the sockets - # aren't explicitly closed first.
- for socket in (self.output_socket, self.input_socket, - self.first_req_send_socket): - if socket is not None: - socket.close(linger=0) + tasks = (self.output_queue_task, self.stats_update_task) - if self.shutdown_path is not None: - # We must ensure that the sync output socket is - # closed cleanly in its own thread. - with self.ctx.socket(zmq.PAIR) as shutdown_sender: - shutdown_sender.connect(self.shutdown_path) - # Send shutdown signal. - shutdown_sender.send(b'') + def close_sockets_and_tasks(): + close_sockets(sockets) + for task in tasks: + if task is not None and not task.done(): + task.cancel() + + if in_loop(loop): + close_sockets_and_tasks() + elif not loop.is_closed(): + loop.call_soon_threadsafe(close_sockets_and_tasks) + else: + # Loop has been closed, try to clean up directly. + del tasks + del close_sockets_and_tasks + close_sockets(sockets) + del self.output_queue_task + del self.stats_update_task + else: + # Sync case. + + # ZMQ context termination can hang if the sockets + # aren't explicitly closed first. + close_sockets((self.output_socket, self.input_socket)) + + if self.shutdown_path is not None: + # We must ensure that the sync output socket is + # closed cleanly in its own thread. + with self.ctx.socket(zmq.PAIR) as shutdown_sender: + shutdown_sender.connect(self.shutdown_path) + # Send shutdown signal. 
+ shutdown_sender.send(b'') def validate_alive(self, frames: Sequence[zmq.Frame]): if len(frames) == 1 and (frames[0].buffer @@ -969,14 +995,19 @@ class DPAsyncMPClient(AsyncMPClient): self.engine_ranks_managed[-1] + 1) async def run_engine_stats_update_task(): - with make_zmq_socket(self.ctx, self.stats_update_address, - zmq.XSUB) as socket, make_zmq_socket( - self.ctx, - self.first_req_sock_addr, - zmq.PAIR, - bind=False) as first_req_rcv_socket: + with (make_zmq_socket(self.ctx, + self.stats_update_address, + zmq.XSUB, + linger=0) as socket, + make_zmq_socket(self.ctx, + self.first_req_sock_addr, + zmq.PAIR, + bind=False, + linger=0) as first_req_rcv_socket): assert isinstance(socket, zmq.asyncio.Socket) assert isinstance(first_req_rcv_socket, zmq.asyncio.Socket) + self.resources.stats_update_socket = socket + self.resources.first_req_rcv_socket = first_req_rcv_socket # Send subscription message. await socket.send(b'\x01') From f4efda821d7f144f5f9478e960b5011578c69bf0 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 14 Aug 2025 12:03:49 +0100 Subject: [PATCH 16/23] Remove Phi 4 Flash configuration workaround (#22723) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/transformers_utils/config.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 02ea0814ddefa..d8c964fb2a4a4 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -449,23 +449,6 @@ def get_config( raise e config = _maybe_remap_hf_config_attrs(config) - # Phi4Flash misuses this config as list[int]. 
Convert it to int and add - # the layer_types list[str] to make it HF compatible - if (config.model_type == "phi4flash"): - # TODO: Remove after the following PR is merged: - # https://huggingface.co/microsoft/Phi-4-mini-flash-reasoning/discussions/6 - if not hasattr(config, "layer_types"): - config.layer_types = [ - "sliding_attention" if i < config.num_hidden_layers // 2 - and i % 2 == 1 else "full_attention" - for i in range(config.num_hidden_layers) - ] - # TODO: Remove after the following PR is merged: - # https://huggingface.co/microsoft/Phi-4-mini-flash-reasoning/discussions/7 - if isinstance(config.sliding_window, list): - config.sliding_window = next( - filter(None, config.sliding_window), None) - elif config_format == ConfigFormat.MISTRAL: # This function loads a params.json config which # should be used when loading models in mistral format From 7655dc3e45e65f39eee9755cda5298e7319240f2 Mon Sep 17 00:00:00 2001 From: iAmir97 <71513472+iAmir97@users.noreply.github.com> Date: Thu, 14 Aug 2025 18:04:18 +0700 Subject: [PATCH 17/23] [Bugfix] Add reset prefix cache for online serving (#22726) Signed-off-by: iAmir97 Signed-off-by: iAmir97 <71513472+iAmir97@users.noreply.github.com> Co-authored-by: iAmir97 Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/engine/async_llm_engine.py | 1 + vllm/v1/engine/async_llm.py | 1 + 2 files changed, 2 insertions(+) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index b6ee4105340a1..73726eeab5fc7 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1092,6 +1092,7 @@ class AsyncLLMEngine(EngineClient): self.engine.reset_prefix_cache(device) async def sleep(self, level: int = 1) -> None: + await self.reset_prefix_cache() self.engine.sleep(level) async def wake_up(self, tags: Optional[list[str]] = None) -> None: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 
a2706327914c5..edc2e235c3c3f 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -576,6 +576,7 @@ class AsyncLLM(EngineClient): await self.engine_core.reset_prefix_cache_async() async def sleep(self, level: int = 1) -> None: + await self.reset_prefix_cache() await self.engine_core.sleep_async(level) async def wake_up(self, tags: Optional[list[str]] = None) -> None: From 0783f139603aaf8c204c36e067a88a8ef1ff47e6 Mon Sep 17 00:00:00 2001 From: Daniele <36171005+dtrifiro@users.noreply.github.com> Date: Thu, 14 Aug 2025 13:06:13 +0200 Subject: [PATCH 18/23] [Doc] fix dead link (#22898) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Daniele Trifirò --- docs/getting_started/installation/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md index f6ecceb85d862..0ee680f5c688c 100644 --- a/docs/getting_started/installation/README.md +++ b/docs/getting_started/installation/README.md @@ -18,7 +18,7 @@ vLLM supports the following hardware platforms: ## Hardware Plugins The backends below live **outside** the main `vllm` repository and follow the -[Hardware-Pluggable RFC](../design/plugin_system.md). +[Hardware-Pluggable RFC](../../design/plugin_system.md). 
| Accelerator | PyPI / package | Repository | |-------------|----------------|------------| From 540d54ca8d38633b69cc5a2ba99641e6304a7564 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Thu, 14 Aug 2025 13:34:34 +0200 Subject: [PATCH 19/23] [CI] Re-enable transcriptions `test_long_audio_request` (#22890) Signed-off-by: NickLucche --- tests/entrypoints/openai/test_transcription_validation.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index 28fd02171b954..e103bd206b54c 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -80,9 +80,6 @@ async def test_bad_requests(mary_had_lamb): async def test_long_audio_request(mary_had_lamb, model_name): server_args = ["--enforce-eager"] - if model_name.startswith("openai"): - return - mary_had_lamb.seek(0) audio, sr = librosa.load(mary_had_lamb) # Add small silence after each audio for repeatability in the split process From 829b9a62d0a89872883397ae4b5184048836589f Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 14 Aug 2025 08:28:09 -0400 Subject: [PATCH 20/23] [Perf] Dont create unnecessary pooling params (#22876) Signed-off-by: Lucas Wilkinson --- vllm/v1/worker/gpu_model_runner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a03e860a91c71..8fb9641844fb5 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -341,13 +341,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): model_kwargs = dict[str, Any]() num_reqs = self.input_batch.num_reqs - pooling_params = self.input_batch.pooling_metadata.pooling_params - - num_pooling_reqs = len(pooling_params) + num_pooling_reqs = len(self.input_batch.pooling_params) if 
num_pooling_reqs == 0: return model_kwargs + pooling_params = self.input_batch.pooling_metadata.pooling_params + assert num_pooling_reqs == num_reqs token_type_id_requests = dict[int, Any]() From 92ff41abea130a3217faa54abb89ccc27aef3f06 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 14 Aug 2025 20:28:50 +0800 Subject: [PATCH 21/23] [Model] Modify the gate implementation of glm4_moe (#22832) Signed-off-by: Jee Jee Li --- docs/models/supported_models.md | 2 +- vllm/model_executor/models/glm4_moe.py | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index dbbbc5122b803..a24fa4bcce333 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -615,7 +615,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | | `GLM4VForCausalLM`^ | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | | ✅︎ | ✅︎ | +| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. 
| ✅︎ | | ✅︎ | diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 131c042c3c2db..aff491f9596c3 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, - ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig @@ -118,14 +117,15 @@ class Glm4MoE(nn.Module): if config.hidden_act != "silu": raise ValueError(f"Unsupported activation: {config.hidden_act}. " "Only silu is supported for now.") - - self.gate = ReplicatedLinear(config.hidden_size, - config.n_routed_experts, - bias=False, - quant_config=None, - params_dtype=torch.float32, - prefix=f"{prefix}.gate") - + # NOTE In the transformers implementation, the gate isn't an nn.Linear, + # so we cannot use ReplicatedLinear here. 
+ # See: https://github.com/huggingface/transformers/blob/v4.55.1/src/transformers/models/glm4_moe/modeling_glm4_moe.py#L260 + self.gate = nn.Linear( + config.hidden_size, + config.n_routed_experts, + bias=False, + dtype=torch.float32, + ) self.gate.e_score_correction_bias = nn.Parameter( torch.empty(config.n_routed_experts, dtype=torch.float32)) @@ -181,7 +181,7 @@ class Glm4MoE(nn.Module): if self.n_shared_experts is not None: shared_output = self.shared_experts(hidden_states) - router_logits, _ = self.gate(hidden_states.to(dtype=torch.float32)) + router_logits = self.gate(hidden_states.to(dtype=torch.float32)) final_hidden_states = self.experts( hidden_states=hidden_states, router_logits=router_logits) * self.routed_scaling_factor From 625ccd1c4d1996a144b0167caefd150cf2956437 Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu Date: Thu, 14 Aug 2025 23:09:27 +0800 Subject: [PATCH 22/23] [Bugfix] Replace custom Encoding class with BatchEncoding in MistralTokenizer (#22786) Signed-off-by: zjy0516 --- vllm/transformers_utils/tokenizers/mistral.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 6ccc636efaf1b..4dd8b2439b3f5 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -2,13 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os -from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Any, Optional, Union, cast import huggingface_hub import regex as re from huggingface_hub import HfApi, hf_hub_download +from transformers.tokenization_utils_base import BatchEncoding from vllm.logger import init_logger from vllm.transformers_utils.tokenizer_base import TokenizerBase @@ -27,11 +27,6 @@ if TYPE_CHECKING: logger = init_logger(__name__) -@dataclass -class Encoding: - input_ids: Union[list[int], list[list[int]]] - - def 
maybe_serialize_tool_calls(request: "ChatCompletionRequest"): # SEE: https://github.com/vllm-project/vllm/pull/9951 # Credits go to: @gcalmettes @@ -359,7 +354,7 @@ class MistralTokenizer(TokenizerBase): # For str, single prompt text else: input_ids = self.encode_one(text, truncation, max_length) - return Encoding(input_ids=input_ids) + return BatchEncoding({"input_ids": input_ids}) def get_vocab(self) -> dict[str, int]: # NB: the dictionary form of the vocabulary collapses token ids that map From dbe298046c8a2528c48cbd2ceca0d074052054c4 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 14 Aug 2025 23:09:44 +0800 Subject: [PATCH 23/23] [Bugfix] Fix parsing of `--disable-mm-preprocessor-cache` (#22909) Signed-off-by: DarkLight1337 --- vllm/engine/arg_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c058001ceb974..dd1072da08447 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -711,7 +711,7 @@ class EngineArgs: "--mm-processor-cache-gb", **multimodal_kwargs["mm_processor_cache_gb"]) multimodal_group.add_argument("--disable-mm-preprocessor-cache", - type=bool, + action="store_true", deprecated=True) multimodal_group.add_argument( "--interleave-mm-strings",