From 4b4092499889415c3a3d23dd44f8e616a6c424d1 Mon Sep 17 00:00:00 2001 From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com> Date: Mon, 1 Dec 2025 20:02:22 -0600 Subject: [PATCH 01/45] [ROCm] Fallback pytorch GELU with tanh approximation to GELU() (#29244) Signed-off-by: Divakar Verma Signed-off-by: Divakar Verma <137818590+divakar-amd@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/model_executor/layers/activation.py | 30 ++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 3471ee327cf8c..7038d0868c7eb 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -159,6 +159,13 @@ class GeluAndMulSparse(CustomOp): self.approximate = approximate if approximate not in ("none", "tanh"): raise ValueError(f"Unknown approximate mode: {approximate}") + if current_platform.is_rocm() and approximate == "tanh": + # TODO:[ROCm] PyTorch native GELU with tanh is unstable with torch.compile + logger.warning_once( + "[ROCm] Pytorch's native GELU with tanh approximation is currently " + "unstable and produces garbage. Fallback to 'none' approximation." + ) + self.approximate = "none" # Sparsity. if activation_sparsity == 0.0: @@ -209,6 +216,12 @@ class GeluAndMul(CustomOp): self.op = torch.ops._C.gelu_and_mul elif approximate == "tanh": self.op = torch.ops._C.gelu_tanh_and_mul + if current_platform.is_rocm() and approximate == "tanh": + logger.warning_once( + "[ROCm] PyTorch's native GELU with tanh approximation is unstable " + "with torch.compile. For native implementation, fallback to 'none' " + "approximation. The custom kernel implementation is unaffected." 
+ ) elif current_platform.is_xpu(): from vllm._ipex_ops import ipex_ops @@ -219,8 +232,12 @@ class GeluAndMul(CustomOp): def forward_native(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" + # TODO: [ROCm] PyTorch's native GELU with tanh is unstable with torch.compile + approximate = self.approximate + if current_platform.is_rocm() and approximate == "tanh": + approximate = "none" d = x.shape[-1] // 2 - return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:] + return F.gelu(x[..., :d], approximate=approximate) * x[..., d:] def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: d = x.shape[-1] // 2 @@ -522,7 +539,16 @@ _ACTIVATION_REGISTRY = LazyDict( "gelu": lambda: nn.GELU(), "gelu_fast": lambda: FastGELU(), "gelu_new": lambda: NewGELU(), - "gelu_pytorch_tanh": lambda: nn.GELU(approximate="tanh"), + "gelu_pytorch_tanh": lambda: ( + # TODO:[ROCm] PyTorch native GELU with tanh is unstable with torch.compile + logger.warning_once( + "[ROCm] PyTorch's native GELU with tanh approximation is unstable. " + "Falling back to GELU(approximate='none')." 
+ ), + nn.GELU(approximate="none"), + )[1] + if current_platform.is_rocm() + else nn.GELU(approximate="tanh"), "relu": lambda: nn.ReLU(), "relu2": lambda: ReLUSquaredActivation(), "silu": lambda: nn.SiLU(), From fa8804ad9c3f21dad3143418c7d9f40190f609ae Mon Sep 17 00:00:00 2001 From: Andrew Xia Date: Mon, 1 Dec 2025 18:11:35 -0800 Subject: [PATCH 02/45] [responsesAPI][4] fix responseOutputItem Kimi K2 thinking bug (#29555) Signed-off-by: Andrew Xia Co-authored-by: Andrew Xia --- tests/entrypoints/test_responses_utils.py | 21 +++++++++++++++++++++ vllm/entrypoints/responses_utils.py | 7 ++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/test_responses_utils.py b/tests/entrypoints/test_responses_utils.py index 893d806b65742..3951bd4840085 100644 --- a/tests/entrypoints/test_responses_utils.py +++ b/tests/entrypoints/test_responses_utils.py @@ -5,6 +5,8 @@ import pytest from openai.types.responses.response_function_tool_call_output_item import ( ResponseFunctionToolCallOutputItem, ) +from openai.types.responses.response_output_message import ResponseOutputMessage +from openai.types.responses.response_output_text import ResponseOutputText from openai.types.responses.response_reasoning_item import ( Content, ResponseReasoningItem, @@ -101,3 +103,22 @@ class TestResponsesUtils: ) with pytest.raises(ValueError): construct_chat_message_with_tool_call(item) + + output_item = ResponseOutputMessage( + id="msg_bf585bbbe3d500e0", + content=[ + ResponseOutputText( + annotations=[], + text="dongyi", + type="output_text", + logprobs=None, + ) + ], + role="assistant", + status="completed", + type="message", + ) + + formatted_item = construct_chat_message_with_tool_call(output_item) + assert formatted_item["role"] == "assistant" + assert formatted_item["content"] == "dongyi" diff --git a/vllm/entrypoints/responses_utils.py b/vllm/entrypoints/responses_utils.py index 07abb80ebc9e3..2e01cb038af85 100644 --- a/vllm/entrypoints/responses_utils.py +++ 
b/vllm/entrypoints/responses_utils.py @@ -97,13 +97,18 @@ def construct_chat_message_with_tool_call( "role": "assistant", "reasoning": reasoning_content, } + elif isinstance(item, ResponseOutputMessage): + return { + "role": "assistant", + "content": item.content[0].text, + } elif isinstance(item, ResponseFunctionToolCallOutputItem): return ChatCompletionToolMessageParam( role="tool", content=item.output, tool_call_id=item.call_id, ) - elif item.get("type") == "function_call_output": + elif isinstance(item, dict) and item.get("type") == "function_call_output": # Append the function call output as a tool message. return ChatCompletionToolMessageParam( role="tool", From d0cd728907d82a109a165612b8790ddaf5496f59 Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Mon, 1 Dec 2025 18:25:05 -0800 Subject: [PATCH 03/45] [Core] Support reseting all running requests' KV while calling `reset_prefix_cache` (#28827) Signed-off-by: Zhuohan Li Signed-off-by: Nick Hill Co-authored-by: Nick Hill --- .../offline_inference/llm_engine_reset_kv.py | 98 +++++++++++++++++++ tests/v1/core/test_reset_prefix_cache_e2e.py | 66 +++++++++++++ tests/v1/core/test_scheduler.py | 31 ++++++ vllm/engine/protocol.py | 2 +- vllm/entrypoints/llm.py | 4 +- vllm/entrypoints/openai/api_server.py | 6 +- vllm/v1/core/sched/async_scheduler.py | 6 ++ vllm/v1/core/sched/interface.py | 8 +- vllm/v1/core/sched/scheduler.py | 77 ++++++++++++--- vllm/v1/engine/async_llm.py | 4 +- vllm/v1/engine/core.py | 4 +- vllm/v1/engine/core_client.py | 22 +++-- vllm/v1/engine/llm_engine.py | 4 +- vllm/v1/request.py | 7 +- vllm/v1/worker/gpu_input_batch.py | 2 + vllm/v1/worker/gpu_model_runner.py | 9 +- 16 files changed, 315 insertions(+), 35 deletions(-) create mode 100644 examples/offline_inference/llm_engine_reset_kv.py create mode 100644 tests/v1/core/test_reset_prefix_cache_e2e.py diff --git a/examples/offline_inference/llm_engine_reset_kv.py b/examples/offline_inference/llm_engine_reset_kv.py new file mode 100644 index 
0000000000000..3fbe7fa7545e6 --- /dev/null +++ b/examples/offline_inference/llm_engine_reset_kv.py @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +This file demonstrates preempt requests when using the `LLMEngine` +for processing prompts with various sampling parameters. +""" + +import argparse + +from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams +from vllm.utils.argparse_utils import FlexibleArgumentParser + + +def create_test_prompts() -> list[tuple[str, SamplingParams]]: + """Create a list of test prompts with their sampling parameters.""" + return [ + ( + "A robot may not injure a human being " * 50, + SamplingParams( + temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=16 + ), + ), + ( + "A robot may not injure a human being " * 50, + SamplingParams( + temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=16 + ), + ), + ( + "To be or not to be,", + SamplingParams( + temperature=0.8, top_k=5, presence_penalty=0.2, max_tokens=128 + ), + ), + ( + "What is the meaning of life?", + SamplingParams( + n=2, temperature=0.8, top_p=0.95, frequency_penalty=0.1, max_tokens=128 + ), + ), + ] + + +def process_requests(engine: LLMEngine, test_prompts: list[tuple[str, SamplingParams]]): + """Continuously process a list of prompts and handle the outputs.""" + request_id = 0 + + print("-" * 50) + step_id = 0 + while test_prompts or engine.has_unfinished_requests(): + print("-" * 50) + import os + + print(f"Step {step_id} (pid={os.getpid()})") + + if test_prompts: + prompt, sampling_params = test_prompts.pop(0) + engine.add_request(str(request_id), prompt, sampling_params) + request_id += 1 + + if step_id == 10: + print(f"Resetting prefix cache at {step_id}") + engine.reset_prefix_cache(reset_running_requests=True) + + request_outputs: list[RequestOutput] = engine.step() + + for request_output in request_outputs: + if request_output.finished: + print("-" * 50) + 
print(request_output) + print("-" * 50) + step_id += 1 + + +def initialize_engine(args: argparse.Namespace) -> LLMEngine: + """Initialize the LLMEngine from the command line arguments.""" + engine_args = EngineArgs.from_cli_args(args) + return LLMEngine.from_engine_args(engine_args) + + +def parse_args(): + parser = FlexibleArgumentParser( + description="Demo on using the LLMEngine class directly" + ) + parser = EngineArgs.add_cli_args(parser) + return parser.parse_args() + + +def main(args: argparse.Namespace): + """Main function that sets up and runs the prompt processing.""" + engine = initialize_engine(args) + test_prompts = create_test_prompts() + process_requests(engine, test_prompts) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/tests/v1/core/test_reset_prefix_cache_e2e.py b/tests/v1/core/test_reset_prefix_cache_e2e.py new file mode 100644 index 0000000000000..e543c30a156ec --- /dev/null +++ b/tests/v1/core/test_reset_prefix_cache_e2e.py @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm import EngineArgs, LLMEngine, SamplingParams + +PROMPTS = [ + "A robot may not injure a human being ", + "To be or not to be,", + "What is the meaning of life?", + "What does the fox say? 
" * 20, # Test long prompt +] + + +def test_reset_prefix_cache_e2e(): + engine_args = EngineArgs( + model="Qwen/Qwen3-0.6B", + gpu_memory_utilization=0.2, + async_scheduling=True, + max_num_batched_tokens=32, + max_model_len=2048, + compilation_config={"mode": 0}, + ) + engine = LLMEngine.from_engine_args(engine_args) + sampling_params = SamplingParams( + temperature=0.0, + max_tokens=16, + ) + + # No preempt case: + for i, prompt in enumerate(PROMPTS): + engine.add_request("ground_truth_" + str(i), prompt, sampling_params) + + ground_truth_results = {} + while engine.has_unfinished_requests(): + request_outputs = engine.step() + for request_output in request_outputs: + if request_output.finished: + ground_truth_results[request_output.request_id] = request_output + + # Preempt case: + for i, prompt in enumerate(PROMPTS): + engine.add_request("preempted_" + str(i), prompt, sampling_params) + + step_id = 0 + preempted_results = {} + while engine.has_unfinished_requests(): + if step_id == 10: + engine.reset_prefix_cache(reset_running_requests=True) + + request_outputs = engine.step() + + for request_output in request_outputs: + if request_output.finished: + preempted_results[request_output.request_id] = request_output + step_id += 1 + + for i in range(len(PROMPTS)): + assert ( + ground_truth_results["ground_truth_" + str(i)].outputs[0].text + == preempted_results["preempted_" + str(i)].outputs[0].text + ), ( + f"ground_truth_results['ground_truth_{i}'].outputs[0].text=" + f"{ground_truth_results['ground_truth_' + str(i)].outputs[0].text} " + f"preempted_results['preempted_{i}'].outputs[0].text=" + f"{preempted_results['preempted_' + str(i)].outputs[0].text}" + ) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index fe4153e609971..0051c11d18d85 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -728,6 +728,37 @@ def test_preempt_during_execution(): assert requests[1].output_token_ids[0] == 42 +def 
test_scheduler_reset_prefix_cache(): + scheduler = create_scheduler(enable_prefix_caching=True) + requests = create_requests(num_requests=10) + for request in requests: + scheduler.add_request(request) + + # Initial scheduling, requests should be at the running state now + _ = scheduler.schedule() + + # Verify requests moved from waiting to running + assert len(scheduler.waiting) == 0 + assert len(scheduler.running) == len(requests) + for i, request in enumerate(requests): + assert scheduler.running[i] == request + + # Reset prefix cache should fail since there are still running requests + # and they are taking KV cache + assert not scheduler.reset_prefix_cache() + + # Reset prefix cache with reset_running_requests=True. All running requests + # Should be pushed back to the waiting queue and kv cache should be freed + assert scheduler.reset_prefix_cache(reset_running_requests=True) + + # Verify requests moved from running to waiting + assert len(scheduler.waiting) == len(requests) + assert len(scheduler.running) == 0 + + for i, request in enumerate(requests): + assert scheduler.waiting[i] == request + + # Note - these test cases mirror some of those in test_rejection_sampler.py @pytest.mark.parametrize( "spec_tokens,output_tokens,expected", diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index f2b19c845018c..1b6330c9f9b65 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -116,7 +116,7 @@ class EngineClient(ABC): ... @abstractmethod - async def reset_prefix_cache(self) -> None: + async def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool: """Reset the prefix cache""" ... 
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index f005605c08d7e..c121fa71f0196 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1492,8 +1492,8 @@ class LLM: def stop_profile(self) -> None: self.llm_engine.stop_profile() - def reset_prefix_cache(self) -> None: - self.llm_engine.reset_prefix_cache() + def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool: + return self.llm_engine.reset_prefix_cache(reset_running_requests) def sleep(self, level: int = 1): """ diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 92161f67f1cf0..cdc316b65ba78 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -877,13 +877,15 @@ if envs.VLLM_SERVER_DEV_MODE: return JSONResponse(content=server_info) @router.post("/reset_prefix_cache") - async def reset_prefix_cache(raw_request: Request): + async def reset_prefix_cache( + raw_request: Request, reset_running_requests: bool = Query(default=False) + ): """ Reset the prefix cache. Note that we currently do not check if the prefix cache is successfully reset in the API server. """ logger.info("Resetting prefix cache...") - await engine_client(raw_request).reset_prefix_cache() + await engine_client(raw_request).reset_prefix_cache(reset_running_requests) return Response(status_code=200) @router.post("/reset_mm_cache") diff --git a/vllm/v1/core/sched/async_scheduler.py b/vllm/v1/core/sched/async_scheduler.py index 7916fafdae1fb..df61eebb395e5 100644 --- a/vllm/v1/core/sched/async_scheduler.py +++ b/vllm/v1/core/sched/async_scheduler.py @@ -45,6 +45,12 @@ class AsyncScheduler(Scheduler): request: Request, new_token_ids: list[int], ) -> tuple[list[int], bool]: + if request.discard_latest_async_tokens: + # If the request is force preempted in reset_prefix_cache, we + # should discard the latest async token. 
+ request.discard_latest_async_tokens = False + return [], False + status_before_update = request.status new_token_ids, stopped = super()._update_request_with_output( request, new_token_ids diff --git a/vllm/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py index 88d99d9402821..c2f503ef2354e 100644 --- a/vllm/v1/core/sched/interface.py +++ b/vllm/v1/core/sched/interface.py @@ -152,10 +152,16 @@ class SchedulerInterface(ABC): return self.has_unfinished_requests() or self.has_finished_requests() @abstractmethod - def reset_prefix_cache(self) -> bool: + def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool: """Reset the prefix cache for KV cache. This is particularly required when the model weights are live-updated. + + Args: + reset_running_requests: If True, all the running requests will be + preempted and moved to the waiting queue. Otherwise, this method + will only reset the KV prefix cache when there is no running request + taking KV cache. """ raise NotImplementedError diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index c1ead200ba8d6..52b98ef654592 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -347,17 +347,7 @@ class Scheduler(SchedulerInterface): else: preempted_req = self.running.pop() - self.kv_cache_manager.free(preempted_req) - self.encoder_cache_manager.free(preempted_req) - preempted_req.status = RequestStatus.PREEMPTED - preempted_req.num_computed_tokens = 0 - preempted_req.num_preemptions += 1 - if self.log_stats: - preempted_req.record_event( - EngineCoreEventType.PREEMPTED, scheduled_timestamp - ) - - self.waiting.prepend_request(preempted_req) + self._preempt_request(preempted_req, scheduled_timestamp) preempted_reqs.append(preempted_req) if preempted_req == request: # No more request to preempt. Cannot schedule this request. 
@@ -756,6 +746,30 @@ class Scheduler(SchedulerInterface): self._update_after_schedule(scheduler_output) return scheduler_output + def _preempt_request( + self, + request: Request, + timestamp: float, + ) -> None: + """Preempt a request and put it back to the waiting queue. + + NOTE: The request should be popped from the running queue outside of this + method. + """ + assert request.status == RequestStatus.RUNNING, ( + "Only running requests can be preempted" + ) + self.kv_cache_manager.free(request) + self.encoder_cache_manager.free(request) + request.status = RequestStatus.PREEMPTED + request.num_computed_tokens = 0 + request.num_preemptions += 1 + if self.log_stats: + request.record_event(EngineCoreEventType.PREEMPTED, timestamp) + + # Put the request back to the waiting queue. + self.waiting.prepend_request(request) + def _update_after_schedule( self, scheduler_output: SchedulerOutput, @@ -1362,8 +1376,45 @@ class Scheduler(SchedulerInterface): def has_finished_requests(self) -> bool: return len(self.finished_req_ids) > 0 - def reset_prefix_cache(self) -> bool: - return self.kv_cache_manager.reset_prefix_cache() + def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool: + """Reset the KV prefix cache. + + If reset_running_requests is True, all the running requests will be + preempted and moved to the waiting queue. + Otherwise, this method will only reset the KV prefix cache when there + is no running requests taking KV cache. + """ + if reset_running_requests: + # For logging. + timestamp = time.monotonic() + # Invalidate all the current running requests KV's by pushing them to + # the waiting queue. In this case, we can reduce the ref count of all + # the kv blocks to 0 and thus we can make sure the reset is successful. + # Preempt in reverse order so the requests will be added back to the + # running queue in FIFO order. 
+ while self.running: + request = self.running.pop() + self._preempt_request(request, timestamp) + # NOTE(zhuohan): For async scheduling, we need to discard the latest + # output token on the fly to avoid a redundant repetitive output token. + request.num_output_placeholders = 0 + request.discard_latest_async_tokens = True + + # Clear scheduled request ids cache. Since we are forcing preemption + # + resumption in the same step, we must act as if these requests were + # not scheduled in the prior step. They will be flushed from the + # persistent batch in the model runner. + self.prev_step_scheduled_req_ids.clear() + + reset_successful = self.kv_cache_manager.reset_prefix_cache() + if reset_running_requests and not reset_successful: + raise RuntimeError( + "Failed to reset KV cache even when all the running requests are " + "preempted and moved to the waiting queue. This is likely due to " + "the presence of running requests waiting for remote KV transfer, " + "which is not supported yet." 
+ ) + return reset_successful def make_stats( self, diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d0708a8a046d1..17a271ca42e26 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -750,8 +750,8 @@ class AsyncLLM(EngineClient): self.input_processor.clear_mm_cache() await self.engine_core.reset_mm_cache_async() - async def reset_prefix_cache(self) -> None: - await self.engine_core.reset_prefix_cache_async() + async def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool: + return await self.engine_core.reset_prefix_cache_async(reset_running_requests) async def sleep(self, level: int = 1) -> None: await self.reset_prefix_cache() diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index e3a5f51a8fc56..61b8422dd6633 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -483,8 +483,8 @@ class EngineCore: self.model_executor.reset_mm_cache() - def reset_prefix_cache(self): - self.scheduler.reset_prefix_cache() + def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool: + return self.scheduler.reset_prefix_cache(reset_running_requests) def sleep(self, level: int = 1): self.model_executor.sleep(level) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 9b440505bd9dc..afa0593921d06 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -138,7 +138,7 @@ class EngineCoreClient(ABC): def reset_mm_cache(self) -> None: raise NotImplementedError - def reset_prefix_cache(self) -> None: + def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool: raise NotImplementedError def sleep(self, level: int = 1) -> None: @@ -208,7 +208,9 @@ class EngineCoreClient(ABC): async def reset_mm_cache_async(self) -> None: raise NotImplementedError - async def reset_prefix_cache_async(self) -> None: + async def reset_prefix_cache_async( + self, reset_running_requests: bool = False + ) -> bool: raise 
NotImplementedError async def sleep_async(self, level: int = 1) -> None: @@ -287,8 +289,8 @@ class InprocClient(EngineCoreClient): def reset_mm_cache(self) -> None: self.engine_core.reset_mm_cache() - def reset_prefix_cache(self) -> None: - self.engine_core.reset_prefix_cache() + def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool: + return self.engine_core.reset_prefix_cache(reset_running_requests) def sleep(self, level: int = 1) -> None: self.engine_core.sleep(level) @@ -751,8 +753,8 @@ class SyncMPClient(MPClient): def reset_mm_cache(self) -> None: self.call_utility("reset_mm_cache") - def reset_prefix_cache(self) -> None: - self.call_utility("reset_prefix_cache") + def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool: + return self.call_utility("reset_prefix_cache", reset_running_requests) def add_lora(self, lora_request: LoRARequest) -> bool: return self.call_utility("add_lora", lora_request) @@ -955,8 +957,12 @@ class AsyncMPClient(MPClient): async def reset_mm_cache_async(self) -> None: await self.call_utility_async("reset_mm_cache") - async def reset_prefix_cache_async(self) -> None: - await self.call_utility_async("reset_prefix_cache") + async def reset_prefix_cache_async( + self, reset_running_requests: bool = False + ) -> bool: + return await self.call_utility_async( + "reset_prefix_cache", reset_running_requests + ) async def sleep_async(self, level: int = 1) -> None: await self.call_utility_async("sleep", level) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index a3bde7ba8d64d..e7dfc554e76fa 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -329,8 +329,8 @@ class LLMEngine: self.input_processor.clear_mm_cache() self.engine_core.reset_mm_cache() - def reset_prefix_cache(self): - self.engine_core.reset_prefix_cache() + def reset_prefix_cache(self, reset_running_requests: bool = False) -> bool: + return 
self.engine_core.reset_prefix_cache(reset_running_requests) def sleep(self, level: int = 1): self.engine_core.sleep(level) diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 366cdadf5a583..f2dfd2eed03cd 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -93,7 +93,12 @@ class Request: if self.prompt_token_ids is not None else [0] * self.num_prompt_tokens ) - self.num_output_placeholders = 0 # Used in async scheduling. + + # Used in async scheduling. + self.num_output_placeholders = 0 + # Used in forced preemption (reset_prefix_cache) with async scheduling. + self.discard_latest_async_tokens = False + self.spec_token_ids: list[int] = [] self.num_computed_tokens = 0 self.cache_salt: str | None = cache_salt diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index e7991baeaa1b8..516c76a5e4b15 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -482,6 +482,8 @@ class InputBatch: self.generators.pop(req_index, None) self.num_logprobs.pop(req_id, None) self.in_progress_prompt_logprobs_cpu.pop(req_id, None) + if self.prev_req_id_to_index is not None: + self.prev_req_id_to_index.pop(req_id, None) self.has_allowed_token_ids.discard(req_id) if self.allowed_token_ids_mask_cpu_tensor is not None: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2218e4f023f92..9eacd2138978b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -774,7 +774,14 @@ class GPUModelRunner( # they will be scheduled again sometime in the future. 
scheduled_req_ids = scheduler_output.num_scheduled_tokens.keys() cached_req_ids = self.input_batch.req_id_to_index.keys() - unscheduled_req_ids = cached_req_ids - scheduled_req_ids + resumed_req_ids = scheduler_output.scheduled_cached_reqs.resumed_req_ids + # NOTE(zhuohan): cached_req_ids and resumed_req_ids are usually disjoint, + # so `(scheduled_req_ids - resumed_req_ids) == scheduled_req_ids` holds + # apart from the forced-preemption case in reset_prefix_cache. And in + # that case we include the resumed_req_ids in the unscheduled set so + # that they get cleared from the persistent batch before being re-scheduled + # in the normal resumed request path. + unscheduled_req_ids = cached_req_ids - (scheduled_req_ids - resumed_req_ids) # NOTE(woosuk): The persistent batch optimization assumes that # consecutive batches contain mostly the same requests. If batches # have low request overlap (e.g., alternating between two distinct From fc95521ba59f4800c8afa7fcdcebc3cf08dd197c Mon Sep 17 00:00:00 2001 From: Wei Wei Date: Mon, 1 Dec 2025 18:58:44 -0800 Subject: [PATCH 04/45] [Misc] Throw error on unintended access to scheduler_config.max_model_len (#29771) Signed-off-by: Wei Wei --- vllm/config/scheduler.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index ff1ac0e18f324..88f3e62fbd4ed 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -274,3 +274,8 @@ class SchedulerConfig: ) return self + + def __getattribute__(self, name: str) -> Any: + if name == "max_model_len" or name == "is_encoder_decoder": + raise AttributeError(f"{name} is an init-only parameter. 
") + return object.__getattribute__(self, name) From 22274b2184d7b1adadd7b244bf3e3ee0cfd71280 Mon Sep 17 00:00:00 2001 From: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com> Date: Mon, 1 Dec 2025 19:21:44 -0800 Subject: [PATCH 05/45] [Misc] Add ReplicaId to Ray metrics (#24267) Signed-off-by: Seiji Eicher Co-authored-by: rongfu.leng <1275177125@qq.com> --- vllm/v1/metrics/ray_wrappers.py | 61 +++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/vllm/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py index a319ffb1d2573..4b46669d5d3bf 100644 --- a/vllm/v1/metrics/ray_wrappers.py +++ b/vllm/v1/metrics/ray_wrappers.py @@ -7,37 +7,55 @@ from vllm.v1.metrics.loggers import PrometheusStatLogger from vllm.v1.spec_decode.metrics import SpecDecodingProm try: + from ray import serve as ray_serve from ray.util import metrics as ray_metrics from ray.util.metrics import Metric except ImportError: ray_metrics = None + ray_serve = None import regex as re +def _get_replica_id() -> str | None: + """Get the current Ray Serve replica ID, or None if not in a Serve context.""" + if ray_serve is None: + return None + try: + return ray_serve.get_replica_context().replica_id.unique_id + except ray_serve.exceptions.RayServeException: + return None + + class RayPrometheusMetric: def __init__(self): if ray_metrics is None: raise ImportError("RayPrometheusMetric requires Ray to be installed.") - self.metric: Metric = None + @staticmethod + def _get_tag_keys(labelnames: list[str] | None) -> tuple[str, ...]: + labels = list(labelnames) if labelnames else [] + labels.append("ReplicaId") + return tuple(labels) + def labels(self, *labels, **labelskwargs): + if labels: + # -1 because ReplicaId was added automatically + expected = len(self.metric._tag_keys) - 1 + if len(labels) != expected: + raise ValueError( + "Number of labels must match the number of tag keys. 
" + f"Expected {expected}, got {len(labels)}" + ) + labelskwargs.update(zip(self.metric._tag_keys, labels)) + + labelskwargs["ReplicaId"] = _get_replica_id() or "" + if labelskwargs: for k, v in labelskwargs.items(): if not isinstance(v, str): labelskwargs[k] = str(v) - self.metric.set_default_tags(labelskwargs) - - if labels: - if len(labels) != len(self.metric._tag_keys): - raise ValueError( - "Number of labels must match the number of tag keys. " - f"Expected {len(self.metric._tag_keys)}, got {len(labels)}" - ) - - self.metric.set_default_tags(dict(zip(self.metric._tag_keys, labels))) - return self @staticmethod @@ -71,10 +89,14 @@ class RayGaugeWrapper(RayPrometheusMetric): # "mostrecent", "all", "sum" do not apply. This logic can be manually # implemented at the observability layer (Prometheus/Grafana). del multiprocess_mode - labelnames_tuple = tuple(labelnames) if labelnames else None + + tag_keys = self._get_tag_keys(labelnames) name = self._get_sanitized_opentelemetry_name(name) + self.metric = ray_metrics.Gauge( - name=name, description=documentation, tag_keys=labelnames_tuple + name=name, + description=documentation, + tag_keys=tag_keys, ) def set(self, value: int | float): @@ -95,10 +117,12 @@ class RayCounterWrapper(RayPrometheusMetric): documentation: str | None = "", labelnames: list[str] | None = None, ): - labelnames_tuple = tuple(labelnames) if labelnames else None + tag_keys = self._get_tag_keys(labelnames) name = self._get_sanitized_opentelemetry_name(name) self.metric = ray_metrics.Counter( - name=name, description=documentation, tag_keys=labelnames_tuple + name=name, + description=documentation, + tag_keys=tag_keys, ) def inc(self, value: int | float = 1.0): @@ -118,13 +142,14 @@ class RayHistogramWrapper(RayPrometheusMetric): labelnames: list[str] | None = None, buckets: list[float] | None = None, ): - labelnames_tuple = tuple(labelnames) if labelnames else None + tag_keys = self._get_tag_keys(labelnames) name = 
self._get_sanitized_opentelemetry_name(name) + boundaries = buckets if buckets else [] self.metric = ray_metrics.Histogram( name=name, description=documentation, - tag_keys=labelnames_tuple, + tag_keys=tag_keys, boundaries=boundaries, ) From f441d36cee366126dc4f6db5b6ca262c1e0cc20c Mon Sep 17 00:00:00 2001 From: Johnny Yang <24908445+jcyang43@users.noreply.github.com> Date: Mon, 1 Dec 2025 19:22:50 -0800 Subject: [PATCH 06/45] Add missing return in _check_vllm_model_embed_input_ids (#29834) Signed-off-by: Johnny Yang --- vllm/model_executor/models/interfaces_base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 2c99fce8d918c..e8d521ec2e8aa 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -76,6 +76,7 @@ def _check_vllm_model_embed_input_ids(model: type[object] | object) -> bool: "this method to `embed_input_ids`." ) model.embed_input_ids = model_get_input_embeddings + return True logger.warning( "The model (%s) is missing the `embed_input_ids` method.", model, From 53bf71b0f0e0fc57ec5879ea3d13cf93a024667a Mon Sep 17 00:00:00 2001 From: Zuyi Zhao Date: Mon, 1 Dec 2025 19:56:39 -0800 Subject: [PATCH 07/45] [Misc] Update conftest for entrypoints/sagemaker test folder (#29799) Signed-off-by: Zuyi Zhao --- tests/entrypoints/sagemaker/conftest.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/sagemaker/conftest.py b/tests/entrypoints/sagemaker/conftest.py index ad219eec18b79..1c34d738fa7a3 100644 --- a/tests/entrypoints/sagemaker/conftest.py +++ b/tests/entrypoints/sagemaker/conftest.py @@ -45,7 +45,10 @@ def basic_server_with_lora(smollm2_lora_files): "64", ] - envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"} + envs = { + "VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True", + "SAGEMAKER_ENABLE_STATEFUL_SESSIONS": "True", + } with RemoteOpenAIServer(MODEL_NAME_SMOLLM, 
args, env_dict=envs) as remote_server: yield remote_server From 81fe3f82af2e30c93ddb106b7a84b7730b982e66 Mon Sep 17 00:00:00 2001 From: usberkeley <150880684+usberkeley@users.noreply.github.com> Date: Tue, 2 Dec 2025 12:48:11 +0800 Subject: [PATCH 08/45] [BugFix] Fix index error in ngram_proposer (#29779) Signed-off-by: Bradley --- tests/v1/spec_decode/test_ngram.py | 34 ++++++++++++++++++++++++++- vllm/v1/spec_decode/ngram_proposer.py | 4 ++-- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py index 692c39282c372..6bc412abe8695 100644 --- a/tests/v1/spec_decode/test_ngram.py +++ b/tests/v1/spec_decode/test_ngram.py @@ -2,7 +2,11 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import numpy as np -from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig +from vllm.config import ( + ModelConfig, + SpeculativeConfig, + VllmConfig, +) from vllm.v1.spec_decode.ngram_proposer import ( NgramProposer, _find_longest_matched_ngram_and_propose_tokens, @@ -167,6 +171,34 @@ def test_ngram_proposer(): assert np.array_equal(result[0], np.array([3, 1])) assert np.array_equal(result[1], np.array([])) + # Test non-contiguous indices: requests 0 and 2 need proposals, + # request 1 is in prefill + proposer = get_ngram_proposer(min_n=2, max_n=2, k=2) + max_model_len = 20 + token_ids_cpu = np.zeros((3, max_model_len), dtype=np.int32) + token_ids_cpu[0, :5] = [1, 2, 3, 1, 2] + token_ids_cpu[1, :3] = [4, 5, 6] + token_ids_cpu[2, :5] = [7, 8, 9, 7, 8] + num_tokens_no_spec = np.array([5, 3, 5], dtype=np.int32) + sampled_token_ids = [[2], [], [8]] # Empty list for request 1 simulates prefill + result = proposer.propose( + sampled_token_ids=sampled_token_ids, + req_ids=["0", "1", "2"], + num_tokens_no_spec=num_tokens_no_spec, + token_ids_cpu=token_ids_cpu, + spec_decode_unsupported_reqs=(), + ) + assert len(result) == 3 + assert np.array_equal(result[0], [3, 1]) + assert 
len(result[1]) == 0 + assert np.array_equal(result[2], [9, 7]) + # Verify internal arrays written to correct indices + assert proposer.valid_ngram_num_drafts[0] == 2 + assert proposer.valid_ngram_num_drafts[1] == 0 + assert proposer.valid_ngram_num_drafts[2] == 2 + assert np.array_equal(proposer.valid_ngram_draft[0, :2], [3, 1]) + assert np.array_equal(proposer.valid_ngram_draft[2, :2], [9, 7]) + # test if 0 threads available: can happen if TP size > CPU count ngram_proposer = get_ngram_proposer(min_n=2, max_n=2, k=2) ngram_proposer.num_numba_thread_available = 0 diff --git a/vllm/v1/spec_decode/ngram_proposer.py b/vllm/v1/spec_decode/ngram_proposer.py index 10b3f0aa040e5..1273ca12c3600 100644 --- a/vllm/v1/spec_decode/ngram_proposer.py +++ b/vllm/v1/spec_decode/ngram_proposer.py @@ -196,9 +196,9 @@ def batch_propose_numba( k=k, ) - valid_ngram_num_drafts[i] = drafter_output.shape[0] + valid_ngram_num_drafts[idx] = drafter_output.shape[0] if len(drafter_output): - valid_ngram_draft[i, : drafter_output.shape[0]] = drafter_output + valid_ngram_draft[idx, : drafter_output.shape[0]] = drafter_output @jit(nopython=True) From a690fb5bd6773128fc82bf03f612156b387d5b61 Mon Sep 17 00:00:00 2001 From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com> Date: Mon, 1 Dec 2025 22:53:27 -0600 Subject: [PATCH 09/45] [CI][ROCm] Fix test_correctness_sliding_window (#29243) Signed-off-by: Divakar Verma Co-authored-by: Cyrus Leung --- tests/v1/e2e/test_correctness_sliding_window.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/v1/e2e/test_correctness_sliding_window.py b/tests/v1/e2e/test_correctness_sliding_window.py index 71b0e86c75c18..b6a78eaa09209 100644 --- a/tests/v1/e2e/test_correctness_sliding_window.py +++ b/tests/v1/e2e/test_correctness_sliding_window.py @@ -5,6 +5,7 @@ from dataclasses import dataclass import pytest from vllm import LLM, SamplingParams +from vllm.platforms import current_platform from ...utils import 
check_answers, prep_prompts @@ -40,10 +41,17 @@ def test_sliding_window_retrieval( If we tell it upfront which we are going to be looking for, then it answers correctly (mostly). """ + # NOTE: For ROCm, we have to enforce eager mode to use custom kernel + # implementation of GELU with tanh approximation, as PyTorch's native + # implementation is currently unstable with torch.compile and produces garbage. + enforce_eager = current_platform.is_rocm() + test_config = model_config[model] llm = LLM( - model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager + model=model, + disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager, + enforce_eager=enforce_eager, ) sampling_params = SamplingParams(temperature=0.0, max_tokens=100) From e2fbfc955e1bc9f67c349c5a2bde63a3edd86b84 Mon Sep 17 00:00:00 2001 From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com> Date: Mon, 1 Dec 2025 23:27:46 -0600 Subject: [PATCH 10/45] [CI][AMD] spec_decode:eagle skip FLASH_ATTN for deepseek on ROCm (#29827) Signed-off-by: Divakar Verma --- tests/v1/e2e/test_spec_decode.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index f711715dec0e6..5246ea6517f6c 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -414,7 +414,10 @@ def test_eagle_correctness( ) if attn_backend == "FLASH_ATTN" and current_platform.is_rocm(): - m.setenv("VLLM_ROCM_USE_AITER", "1") + if "deepseek" in model_setup[1].lower(): + pytest.skip("FLASH_ATTN for deepseek not supported on ROCm platform") + else: + m.setenv("VLLM_ROCM_USE_AITER", "1") method, model_name, spec_model_name, tp_size = model_setup max_model_len = 2048 From 653591d5e73b34ffd9186c61e964474bcc4b7c80 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 2 Dec 2025 13:33:37 +0800 Subject: [PATCH 11/45] [Chore] Move tokenizer initialization methods (#29793) Signed-off-by: DarkLight1337 --- 
benchmarks/benchmark_prefix_caching.py | 2 +- .../benchmark_serving_structured_output.py | 2 +- .../test_dynamic_shapes_compilation.py | 2 +- .../entrypoints/openai/test_chat_template.py | 2 +- .../entrypoints/openai/test_lora_resolvers.py | 2 +- .../openai/test_return_token_ids.py | 2 +- .../openai/test_return_tokens_as_ids.py | 2 +- tests/entrypoints/openai/test_serving_chat.py | 2 +- .../openai/test_token_in_token_out.py | 2 +- tests/entrypoints/openai/test_tokenization.py | 2 +- .../tool_parsers/test_hermes_tool_parser.py | 2 +- .../entrypoints/pooling/embed/test_online.py | 2 +- .../pooling/pooling/test_online.py | 2 +- tests/entrypoints/test_chat_utils.py | 3 +- .../multimodal/processing/test_common.py | 7 +- .../processing/test_tensor_schema.py | 2 +- tests/models/utils.py | 2 +- tests/test_inputs.py | 4 +- tests/tokenizers_/test_basic.py | 3 +- tests/tokenizers_/test_registry.py | 3 +- .../tool_use/test_deepseekv31_tool_parser.py | 2 +- .../tool_use/test_ernie45_moe_tool_parser.py | 3 +- tests/tool_use/test_glm4_moe_tool_parser.py | 2 +- tests/tool_use/test_jamba_tool_parser.py | 3 +- tests/tool_use/test_kimi_k2_tool_parser.py | 2 +- tests/tool_use/test_minimax_tool_parser.py | 2 +- tests/tool_use/test_openai_tool_parser.py | 2 +- tests/tool_use/test_qwen3coder_tool_parser.py | 3 +- tests/tool_use/test_seed_oss_tool_parser.py | 3 +- tests/tool_use/test_xlam_tool_parser.py | 3 +- tests/transformers_utils/test_config.py | 2 +- tests/utils.py | 2 +- .../v1/entrypoints/openai/test_completion.py | 2 +- tests/v1/tpu/test_perf.py | 2 +- vllm/benchmarks/serve.py | 2 +- vllm/model_executor/models/adapters.py | 4 +- vllm/model_executor/models/deepseek_ocr.py | 2 +- vllm/model_executor/models/deepseek_vl2.py | 2 +- vllm/model_executor/models/granite_speech.py | 8 +- vllm/model_executor/models/gritlm.py | 2 +- .../model_executor/models/nano_nemotron_vl.py | 7 +- vllm/model_executor/models/pixtral.py | 3 +- vllm/model_executor/models/voxtral.py | 3 +- 
vllm/model_executor/models/whisper.py | 6 +- vllm/multimodal/registry.py | 3 +- vllm/tokenizers/__init__.py | 11 ++- vllm/tokenizers/registry.py | 38 +++++++- vllm/transformers_utils/tokenizer.py | 91 +++++++++---------- vllm/v1/engine/async_llm.py | 5 +- vllm/v1/engine/llm_engine.py | 5 +- vllm/v1/structured_output/__init__.py | 4 +- 51 files changed, 150 insertions(+), 129 deletions(-) diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 28fc383a318dd..e6391134ff932 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -40,7 +40,7 @@ from vllm.engine.arg_utils import EngineArgs from vllm.utils.argparse_utils import FlexibleArgumentParser try: - from vllm.transformers_utils.tokenizer import get_tokenizer + from vllm.tokenizers import get_tokenizer except ImportError: from backend_request_func import get_tokenizer diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index 55001cf3722a0..df122b4c5e8db 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -46,7 +46,7 @@ from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase try: - from vllm.transformers_utils.tokenizer import get_tokenizer + from vllm.tokenizers import get_tokenizer except ImportError: from backend_request_func import get_tokenizer diff --git a/tests/compile/test_dynamic_shapes_compilation.py b/tests/compile/test_dynamic_shapes_compilation.py index c20aea822fe81..1966b03cd9c89 100644 --- a/tests/compile/test_dynamic_shapes_compilation.py +++ b/tests/compile/test_dynamic_shapes_compilation.py @@ -8,7 +8,7 @@ import torch from vllm import LLM, SamplingParams from vllm.config.compilation import CompilationMode, DynamicShapesType -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.tokenizers import get_tokenizer from vllm.utils.torch_utils 
import is_torch_equal_or_newer diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py index ee79ed59c4102..77087ac21ea8b 100644 --- a/tests/entrypoints/openai/test_chat_template.py +++ b/tests/entrypoints/openai/test_chat_template.py @@ -6,7 +6,7 @@ import pytest from vllm.config import ModelConfig from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template from vllm.entrypoints.openai.protocol import ChatCompletionRequest -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.tokenizers import get_tokenizer from ...models.registry import HF_EXAMPLE_MODELS from ...utils import VLLM_PATH diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index 4856cafef44b3..ea6b3d812d8fe 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -14,7 +14,7 @@ from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels from vllm.lora.request import LoRARequest from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.tokenizers import get_tokenizer from vllm.v1.engine.async_llm import AsyncLLM MODEL_NAME = "openai-community/gpt2" diff --git a/tests/entrypoints/openai/test_return_token_ids.py b/tests/entrypoints/openai/test_return_token_ids.py index feef48a36dfa1..8537082e3f8d1 100644 --- a/tests/entrypoints/openai/test_return_token_ids.py +++ b/tests/entrypoints/openai/test_return_token_ids.py @@ -3,7 +3,7 @@ import pytest -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.tokenizers import get_tokenizer from ...utils import RemoteOpenAIServer diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py index 
cedf6ce160607..d4d9a6c5b6120 100644 --- a/tests/entrypoints/openai/test_return_tokens_as_ids.py +++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py @@ -7,7 +7,7 @@ import pytest -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.tokenizers import get_tokenizer from ...utils import RemoteOpenAIServer diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 6a1b15c4131e0..9ea65f9fa6e7a 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -14,7 +14,7 @@ from vllm.config.multimodal import MultiModalConfig from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.tokenizers import get_tokenizer from vllm.v1.engine.async_llm import AsyncLLM from ...utils import RemoteOpenAIServer diff --git a/tests/entrypoints/openai/test_token_in_token_out.py b/tests/entrypoints/openai/test_token_in_token_out.py index 25eb5882be89c..c7f8abe27e6e0 100644 --- a/tests/entrypoints/openai/test_token_in_token_out.py +++ b/tests/entrypoints/openai/test_token_in_token_out.py @@ -7,7 +7,7 @@ import tempfile import pytest from vllm.model_executor.model_loader.weight_utils import download_weights_from_hf -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.tokenizers import get_tokenizer from ...utils import RemoteOpenAIServer diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py index 751f94319eb9f..052f9fecc18de 100644 --- a/tests/entrypoints/openai/test_tokenization.py +++ b/tests/entrypoints/openai/test_tokenization.py @@ -5,7 +5,7 @@ import pytest import pytest_asyncio import requests -from vllm.transformers_utils.tokenizer import get_tokenizer 
+from vllm.tokenizers import get_tokenizer from ...utils import RemoteOpenAIServer diff --git a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py index b2303ab0e7b7c..ce6727bb04f6c 100644 --- a/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py @@ -271,7 +271,7 @@ async def test_streaming_product_tool_call(): @pytest.fixture def qwen_tokenizer() -> TokenizerLike: - from vllm.transformers_utils.tokenizer import get_tokenizer + from vllm.tokenizers import get_tokenizer return get_tokenizer("Qwen/Qwen3-32B") diff --git a/tests/entrypoints/pooling/embed/test_online.py b/tests/entrypoints/pooling/embed/test_online.py index 6aac649bc3035..ddba1c790ba8c 100644 --- a/tests/entrypoints/pooling/embed/test_online.py +++ b/tests/entrypoints/pooling/embed/test_online.py @@ -18,7 +18,7 @@ from tests.utils import RemoteOpenAIServer from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse from vllm.platforms import current_platform -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.tokenizers import get_tokenizer from vllm.utils.serial_utils import ( EMBED_DTYPE_TO_TORCH_DTYPE, ENDIANNESS, diff --git a/tests/entrypoints/pooling/pooling/test_online.py b/tests/entrypoints/pooling/pooling/test_online.py index 977c74d54a351..cc5c2f26f80fb 100644 --- a/tests/entrypoints/pooling/pooling/test_online.py +++ b/tests/entrypoints/pooling/pooling/test_online.py @@ -12,7 +12,7 @@ import torch from tests.models.utils import check_embeddings_close from tests.utils import RemoteOpenAIServer from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.tokenizers import get_tokenizer from vllm.utils.serial_utils import ( EMBED_DTYPE_TO_TORCH_DTYPE, 
ENDIANNESS, diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index a351cda60621f..03a0c058ea690 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -28,8 +28,7 @@ from vllm.multimodal.utils import ( encode_image_base64, encode_video_base64, ) -from vllm.tokenizers import MistralTokenizer -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.tokenizers import MistralTokenizer, get_tokenizer from ..models.registry import HF_EXAMPLE_MODELS from ..utils import VLLM_PATH diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index c39e522100901..90158a028b0bd 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -22,11 +22,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict from vllm.multimodal.cache import MultiModalProcessorOnlyCache from vllm.multimodal.inputs import MultiModalInputs from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext -from vllm.tokenizers import MistralTokenizer -from vllm.transformers_utils.tokenizer import ( - cached_tokenizer_from_config, - encode_tokens, -) +from vllm.tokenizers import MistralTokenizer, cached_tokenizer_from_config +from vllm.transformers_utils.tokenizer import encode_tokens from ....multimodal.utils import random_audio, random_image, random_video from ...registry import ( diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index 66a3fbe11b6a5..7628ab4fe2349 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -31,7 +31,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext from 
vllm.multimodal.utils import group_mm_kwargs_by_modality from vllm.platforms import current_platform -from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config +from vllm.tokenizers import cached_tokenizer_from_config from vllm.utils.collection_utils import is_list_of from vllm.utils.torch_utils import set_default_torch_dtype diff --git a/tests/models/utils.py b/tests/models/utils.py index 9843887a13204..d84b4b820533e 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -13,7 +13,7 @@ from transformers import PretrainedConfig from vllm.config.model import ModelConfig, ModelDType, RunnerOption from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs from vllm.multimodal.processing import InputProcessingContext -from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config +from vllm.tokenizers import cached_tokenizer_from_config from .. import ci_envs from .registry import HF_EXAMPLE_MODELS diff --git a/tests/test_inputs.py b/tests/test_inputs.py index b1fb4e06a6906..c4339827de8b6 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -7,7 +7,7 @@ from vllm.config import ModelConfig from vllm.inputs import zip_enc_dec_prompts from vllm.inputs.parse import parse_raw_prompts from vllm.inputs.preprocess import InputPreprocessor -from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs +from vllm.tokenizers import init_tokenizer_from_config pytestmark = pytest.mark.cpu_test @@ -108,7 +108,7 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs): ) def test_preprocessor_always_mm_code_path(model_id, prompt): model_config = ModelConfig(model=model_id) - tokenizer = init_tokenizer_from_configs(model_config) + tokenizer = init_tokenizer_from_config(model_config) input_preprocessor = InputPreprocessor(model_config, tokenizer) # HF processor adds sep token diff --git a/tests/tokenizers_/test_basic.py b/tests/tokenizers_/test_basic.py index 1fca633cc5cd7..b152227a5a50f 100644 --- 
a/tests/tokenizers_/test_basic.py +++ b/tests/tokenizers_/test_basic.py @@ -5,8 +5,7 @@ from typing import _get_protocol_attrs # type: ignore import pytest from transformers import PreTrainedTokenizerBase -from vllm.tokenizers import TokenizerLike -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.tokenizers import TokenizerLike, get_tokenizer def _get_missing_attrs(obj: object, target: type): diff --git a/tests/tokenizers_/test_registry.py b/tests/tokenizers_/test_registry.py index 57b6a14a54b3f..7e795350d64c8 100644 --- a/tests/tokenizers_/test_registry.py +++ b/tests/tokenizers_/test_registry.py @@ -2,8 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from pathlib import Path -from vllm.tokenizers import TokenizerLike, TokenizerRegistry -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.tokenizers import TokenizerLike, TokenizerRegistry, get_tokenizer class TestTokenizer(TokenizerLike): diff --git a/tests/tool_use/test_deepseekv31_tool_parser.py b/tests/tool_use/test_deepseekv31_tool_parser.py index db5168071fbce..8beb7739b6081 100644 --- a/tests/tool_use/test_deepseekv31_tool_parser.py +++ b/tests/tool_use/test_deepseekv31_tool_parser.py @@ -6,7 +6,7 @@ import pytest from vllm.entrypoints.openai.tool_parsers.deepseekv31_tool_parser import ( DeepSeekV31ToolParser, ) -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.tokenizers import get_tokenizer MODEL = "deepseek-ai/DeepSeek-V3.1" diff --git a/tests/tool_use/test_ernie45_moe_tool_parser.py b/tests/tool_use/test_ernie45_moe_tool_parser.py index 8fbbbba325385..92f86de23267b 100644 --- a/tests/tool_use/test_ernie45_moe_tool_parser.py +++ b/tests/tool_use/test_ernie45_moe_tool_parser.py @@ -14,9 +14,8 @@ from vllm.entrypoints.openai.protocol import ( ToolCall, ) from vllm.entrypoints.openai.tool_parsers.ernie45_tool_parser import Ernie45ToolParser -from vllm.tokenizers import TokenizerLike +from vllm.tokenizers import 
TokenizerLike, get_tokenizer from vllm.tokenizers.detokenizer_utils import detokenize_incrementally -from vllm.transformers_utils.tokenizer import get_tokenizer # Use a common model that is likely to be available MODEL = "baidu/ERNIE-4.5-21B-A3B-Thinking" diff --git a/tests/tool_use/test_glm4_moe_tool_parser.py b/tests/tool_use/test_glm4_moe_tool_parser.py index f545f52c02dcb..753b3f1c23adf 100644 --- a/tests/tool_use/test_glm4_moe_tool_parser.py +++ b/tests/tool_use/test_glm4_moe_tool_parser.py @@ -10,7 +10,7 @@ from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall from vllm.entrypoints.openai.tool_parsers.glm4_moe_tool_parser import ( Glm4MoeModelToolParser, ) -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.tokenizers import get_tokenizer pytestmark = pytest.mark.cpu_test diff --git a/tests/tool_use/test_jamba_tool_parser.py b/tests/tool_use/test_jamba_tool_parser.py index c7ca024f3a767..9036bd32dd704 100644 --- a/tests/tool_use/test_jamba_tool_parser.py +++ b/tests/tool_use/test_jamba_tool_parser.py @@ -10,9 +10,8 @@ from partial_json_parser.core.options import Allow from vllm.entrypoints.openai.protocol import DeltaMessage, FunctionCall, ToolCall from vllm.entrypoints.openai.tool_parsers.jamba_tool_parser import JambaToolParser -from vllm.tokenizers import TokenizerLike +from vllm.tokenizers import TokenizerLike, get_tokenizer from vllm.tokenizers.detokenizer_utils import detokenize_incrementally -from vllm.transformers_utils.tokenizer import get_tokenizer pytestmark = pytest.mark.cpu_test diff --git a/tests/tool_use/test_kimi_k2_tool_parser.py b/tests/tool_use/test_kimi_k2_tool_parser.py index 3a48b5206141d..1558a9c3e01f2 100644 --- a/tests/tool_use/test_kimi_k2_tool_parser.py +++ b/tests/tool_use/test_kimi_k2_tool_parser.py @@ -8,7 +8,7 @@ import pytest from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall from vllm.entrypoints.openai.tool_parsers.kimi_k2_tool_parser import KimiK2ToolParser -from 
vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.tokenizers import get_tokenizer pytestmark = pytest.mark.cpu_test diff --git a/tests/tool_use/test_minimax_tool_parser.py b/tests/tool_use/test_minimax_tool_parser.py index 4332984083dab..dda63f984a832 100644 --- a/tests/tool_use/test_minimax_tool_parser.py +++ b/tests/tool_use/test_minimax_tool_parser.py @@ -13,7 +13,7 @@ from vllm.entrypoints.openai.protocol import ( ToolCall, ) from vllm.entrypoints.openai.tool_parsers.minimax_tool_parser import MinimaxToolParser -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.tokenizers import get_tokenizer pytestmark = pytest.mark.cpu_test diff --git a/tests/tool_use/test_openai_tool_parser.py b/tests/tool_use/test_openai_tool_parser.py index c874a9601ae70..6537f281c0e1b 100644 --- a/tests/tool_use/test_openai_tool_parser.py +++ b/tests/tool_use/test_openai_tool_parser.py @@ -16,7 +16,7 @@ from openai_harmony import ( from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall from vllm.entrypoints.openai.tool_parsers.openai_tool_parser import OpenAIToolParser -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.tokenizers import get_tokenizer MODEL = "gpt2" diff --git a/tests/tool_use/test_qwen3coder_tool_parser.py b/tests/tool_use/test_qwen3coder_tool_parser.py index 864bb0d0c06c2..5a56768805fdf 100644 --- a/tests/tool_use/test_qwen3coder_tool_parser.py +++ b/tests/tool_use/test_qwen3coder_tool_parser.py @@ -17,9 +17,8 @@ from vllm.entrypoints.openai.tool_parsers.qwen3coder_tool_parser import ( Qwen3CoderToolParser, ) from vllm.entrypoints.openai.tool_parsers.qwen3xml_tool_parser import Qwen3XMLToolParser -from vllm.tokenizers import TokenizerLike +from vllm.tokenizers import TokenizerLike, get_tokenizer from vllm.tokenizers.detokenizer_utils import detokenize_incrementally -from vllm.transformers_utils.tokenizer import get_tokenizer pytestmark = pytest.mark.cpu_test diff --git 
a/tests/tool_use/test_seed_oss_tool_parser.py b/tests/tool_use/test_seed_oss_tool_parser.py index d94df61128c9c..8795c35a1347f 100644 --- a/tests/tool_use/test_seed_oss_tool_parser.py +++ b/tests/tool_use/test_seed_oss_tool_parser.py @@ -15,9 +15,8 @@ from vllm.entrypoints.openai.protocol import ( ToolCall, ) from vllm.entrypoints.openai.tool_parsers.seed_oss_tool_parser import SeedOssToolParser -from vllm.tokenizers import TokenizerLike +from vllm.tokenizers import TokenizerLike, get_tokenizer from vllm.tokenizers.detokenizer_utils import detokenize_incrementally -from vllm.transformers_utils.tokenizer import get_tokenizer pytestmark = pytest.mark.cpu_test diff --git a/tests/tool_use/test_xlam_tool_parser.py b/tests/tool_use/test_xlam_tool_parser.py index fdcdd4038131a..3098fda036a81 100644 --- a/tests/tool_use/test_xlam_tool_parser.py +++ b/tests/tool_use/test_xlam_tool_parser.py @@ -13,9 +13,8 @@ from vllm.entrypoints.openai.protocol import ( ToolCall, ) from vllm.entrypoints.openai.tool_parsers.xlam_tool_parser import xLAMToolParser -from vllm.tokenizers import TokenizerLike +from vllm.tokenizers import TokenizerLike, get_tokenizer from vllm.tokenizers.detokenizer_utils import detokenize_incrementally -from vllm.transformers_utils.tokenizer import get_tokenizer pytestmark = pytest.mark.cpu_test diff --git a/tests/transformers_utils/test_config.py b/tests/transformers_utils/test_config.py index 7b56c9f0189d4..85680c41ed74d 100644 --- a/tests/transformers_utils/test_config.py +++ b/tests/transformers_utils/test_config.py @@ -6,8 +6,8 @@ only get the `eos_token_id` from the tokenizer as defined by `vllm.LLMEngine._get_eos_token_id`. 
""" +from vllm.tokenizers import get_tokenizer from vllm.transformers_utils.config import try_get_generation_config -from vllm.transformers_utils.tokenizer import get_tokenizer def test_get_llama3_eos_token(): diff --git a/tests/utils.py b/tests/utils.py index 9565b0ff06e36..539f67c47ac1d 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -44,7 +44,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.entrypoints.cli.serve import ServeSubcommand from vllm.model_executor.model_loader import get_model_loader from vllm.platforms import current_platform -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.tokenizers import get_tokenizer from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.mem_constants import GB_bytes from vllm.utils.network_utils import get_open_port diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py index 736ccbefbc4da..ddab006d0d31a 100644 --- a/tests/v1/entrypoints/openai/test_completion.py +++ b/tests/v1/entrypoints/openai/test_completion.py @@ -9,7 +9,7 @@ import regex as re from openai import BadRequestError from tests.utils import RemoteOpenAIServer -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.tokenizers import get_tokenizer # any model with a chat template should work here MODEL_NAME = "facebook/opt-125m" diff --git a/tests/v1/tpu/test_perf.py b/tests/v1/tpu/test_perf.py index e230491cddb01..e62b969fe3b95 100644 --- a/tests/v1/tpu/test_perf.py +++ b/tests/v1/tpu/test_perf.py @@ -14,7 +14,7 @@ import pytest from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.tokenizers import get_tokenizer if TYPE_CHECKING: from tests.conftest import VllmRunner diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 519303c0bfa0a..2933f5d01b274 100644 --- a/vllm/benchmarks/serve.py +++ 
b/vllm/benchmarks/serve.py @@ -47,7 +47,7 @@ from vllm.benchmarks.lib.endpoint_request_func import ( ) from vllm.benchmarks.lib.ready_checker import wait_for_endpoint from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.tokenizers import get_tokenizer from vllm.utils.gc_utils import freeze_gc_heap from vllm.utils.network_utils import join_host_port diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 05f257feea3ee..007d847ac3b7b 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -444,7 +444,7 @@ def load_weights_using_from_2_way_softmax( ) loaded_weights = pooling_model_cls.load_weights(model, weights, load_lm_head=True) - from vllm.transformers_utils.tokenizer import get_tokenizer + from vllm.tokenizers import get_tokenizer tokenizer = get_tokenizer( model_config.tokenizer, @@ -498,7 +498,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te # Skip ModelForSequenceClassification in MRO to avoid infinite recursion loaded_weights = type(model).__mro__[1].load_weights(model, weights) - from vllm.transformers_utils.tokenizer import get_tokenizer + from vllm.tokenizers import get_tokenizer tokenizer = get_tokenizer( model_config.tokenizer, diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py index 8179f916ff417..019fb3e29ab91 100644 --- a/vllm/model_executor/models/deepseek_ocr.py +++ b/vllm/model_executor/models/deepseek_ocr.py @@ -45,6 +45,7 @@ from vllm.multimodal.processing import ( from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sampling_params import SamplingParams from vllm.sequence import IntermediateTensors +from vllm.tokenizers import cached_tokenizer_from_config from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config from 
vllm.transformers_utils.processors.deepseek_ocr import ( BASE_SIZE, @@ -53,7 +54,6 @@ from vllm.transformers_utils.processors.deepseek_ocr import ( DeepseekOCRProcessor, count_tiles, ) -from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.v1.sample.logits_processor import ( AdapterLogitsProcessor, diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 1b6e4110039c4..56c1a87a25401 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -41,13 +41,13 @@ from vllm.multimodal.processing import ( ) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from vllm.tokenizers import cached_tokenizer_from_config from vllm.transformers_utils.configs.deepseek_vl2 import ( DeepseekVLV2Config, MlpProjectorConfig, VisionEncoderConfig, ) from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor -from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.torch_utils import set_default_torch_dtype diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index 1797adab8d146..accf7e6ef2f47 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -59,8 +59,8 @@ from vllm.multimodal.processing import ( ) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.processor import cached_get_processor -from vllm.transformers_utils.tokenizer import cached_get_tokenizer +from vllm.tokenizers import cached_tokenizer_from_config +from vllm.transformers_utils.processor import cached_processor_from_config from vllm.utils.tensor_schema import 
TensorSchema, TensorShape from .blip2 import Blip2QFormerModel @@ -862,7 +862,7 @@ class GraniteSpeechForConditionalGeneration( else: raise ValueError(f"Unsupported task type {task_type}") - tokenizer = cached_get_tokenizer(model_config.model) + tokenizer = cached_tokenizer_from_config(model_config) chat = [dict(role="user", content=user_prompt)] prompt = tokenizer.apply_chat_template( chat, @@ -886,7 +886,7 @@ class GraniteSpeechForConditionalGeneration( model_config: ModelConfig, ) -> int | None: """Get the number of audio tokens for an audio duration in sec.""" - processor = cached_get_processor(model_config.model) + processor = cached_processor_from_config(model_config) hop_length = processor.audio_processor.melspec_kwargs["hop_length"] proj_win_size = processor.audio_processor.projector_window_size ds_rate = processor.audio_processor.projector_downsample_rate diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index 181c4ed2dca5a..550e8b014d5e7 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -19,7 +19,7 @@ from vllm.model_executor.layers.pooler import ( ) from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.tasks import PoolingTask -from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config +from vllm.tokenizers import cached_tokenizer_from_config from vllm.v1.outputs import PoolerOutput from vllm.v1.pool.metadata import PoolingMetadata diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 11beeddabe307..0f86a17752802 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -73,12 +73,9 @@ from vllm.multimodal.processing import ( ) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors -from vllm.tokenizers import TokenizerLike +from vllm.tokenizers import TokenizerLike, 
cached_tokenizer_from_config from vllm.transformers_utils.configs.radio import RadioConfig -from vllm.transformers_utils.tokenizer import ( - cached_tokenizer_from_config, - encode_tokens, -) +from vllm.transformers_utils.tokenizer import encode_tokens from vllm.utils.tensor_schema import TensorSchema, TensorShape from .utils import _merge_multimodal_embeddings diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 54bde75cc0131..cad241842cd30 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -59,8 +59,7 @@ from vllm.multimodal.processing import ( from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors -from vllm.tokenizers import MistralTokenizer -from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config +from vllm.tokenizers import MistralTokenizer, cached_tokenizer_from_config from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 0a39ea7ef5bff..45f8fa079c714 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -51,8 +51,7 @@ from vllm.multimodal.processing import ( ) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from vllm.tokenizers import MistralTokenizer -from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config +from vllm.tokenizers import MistralTokenizer, cached_tokenizer_from_config from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsTranscription from .utils import init_vllm_registered_model, maybe_prefix diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 
1ed6ae4366d0c..0daf6bda61ccb 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -48,7 +48,7 @@ from vllm.multimodal.processing import ( PromptUpdate, ) from vllm.multimodal.profiling import BaseDummyInputsBuilder -from vllm.transformers_utils.processor import cached_get_processor +from vllm.transformers_utils.processor import cached_processor_from_config from vllm.utils.jsontree import json_map_leaves from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.torch_utils import set_default_torch_dtype @@ -850,7 +850,7 @@ class WhisperForConditionalGeneration( def get_speech_to_text_config( cls, model_config: ModelConfig, task_type: str ) -> SpeechToTextConfig: - processor = cached_get_processor(model_config.model) + processor = cached_processor_from_config(model_config) return SpeechToTextConfig( max_audio_clip_s=processor.feature_extractor.chunk_length, @@ -864,7 +864,7 @@ class WhisperForConditionalGeneration( stt_config: SpeechToTextConfig, model_config: ModelConfig, ) -> int | None: - processor = cached_get_processor(model_config.model) + processor = cached_processor_from_config(model_config) hop_length = processor.feature_extractor.hop_length assert hop_length is not None # NOTE(NickLucche) user can't pass encoder diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 2fdae46e547b0..00a84f9dec4f7 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -6,8 +6,7 @@ from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, cast from vllm.config.multimodal import BaseDummyOptions from vllm.logger import init_logger -from vllm.tokenizers import TokenizerLike -from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config +from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config from .cache import BaseMultiModalProcessorCache from .processing import ( diff --git a/vllm/tokenizers/__init__.py b/vllm/tokenizers/__init__.py 
index 14f0148cf7ba8..42487f5f51651 100644 --- a/vllm/tokenizers/__init__.py +++ b/vllm/tokenizers/__init__.py @@ -4,12 +4,21 @@ from .hf import HfTokenizer from .mistral import MistralTokenizer from .protocol import TokenizerLike -from .registry import TokenizerRegistry, get_tokenizer +from .registry import ( + TokenizerRegistry, + cached_get_tokenizer, + cached_tokenizer_from_config, + get_tokenizer, + init_tokenizer_from_config, +) __all__ = [ "TokenizerLike", "HfTokenizer", "MistralTokenizer", "TokenizerRegistry", + "cached_get_tokenizer", "get_tokenizer", + "cached_tokenizer_from_config", + "init_tokenizer_from_config", ] diff --git a/vllm/tokenizers/registry.py b/vllm/tokenizers/registry.py index d5e7899321615..bf9d295de23ae 100644 --- a/vllm/tokenizers/registry.py +++ b/vllm/tokenizers/registry.py @@ -2,10 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib.util from collections.abc import Callable +from functools import lru_cache from pathlib import Path -from typing import TypeVar, overload +from typing import TYPE_CHECKING, TypeVar, overload import huggingface_hub +from typing_extensions import assert_never import vllm.envs as envs from vllm.logger import init_logger @@ -21,6 +23,9 @@ from vllm.utils.import_utils import resolve_obj_by_qualname from .protocol import TokenizerLike +if TYPE_CHECKING: + from vllm.config import ModelConfig + logger = init_logger(__name__) _T = TypeVar("_T", bound=type[TokenizerLike]) @@ -195,3 +200,34 @@ def get_tokenizer( ) return tokenizer + + +cached_get_tokenizer = lru_cache(get_tokenizer) + + +def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs): + return cached_get_tokenizer( + model_config.tokenizer, + tokenizer_mode=model_config.tokenizer_mode, + revision=model_config.tokenizer_revision, + trust_remote_code=model_config.trust_remote_code, + **kwargs, + ) + + +def init_tokenizer_from_config(model_config: "ModelConfig"): + runner_type = model_config.runner_type 
+ if runner_type == "generate" or runner_type == "draft": + truncation_side = "left" + elif runner_type == "pooling": + truncation_side = "right" + else: + assert_never(runner_type) + + return get_tokenizer( + model_config.tokenizer, + tokenizer_mode=model_config.tokenizer_mode, + trust_remote_code=model_config.trust_remote_code, + revision=model_config.tokenizer_revision, + truncation_side=truncation_side, + ) diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 0911848c02e14..617d16779ca26 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -2,17 +2,10 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import warnings -from functools import lru_cache -from typing import TYPE_CHECKING, Any - -from typing_extensions import assert_never +from typing import Any from vllm.logger import init_logger -from vllm.tokenizers import TokenizerLike, get_tokenizer - -if TYPE_CHECKING: - from vllm.config import ModelConfig - +from vllm.tokenizers import TokenizerLike logger = init_logger(__name__) @@ -28,18 +21,54 @@ def __getattr__(name: str): ) return TokenizerLike - if name == "get_cached_tokenizer": - from vllm.tokenizers.hf import get_cached_tokenizer + if name == "get_tokenizer": + from vllm.tokenizers import get_tokenizer warnings.warn( - "`vllm.transformers_utils.tokenizer.get_cached_tokenizer` " - "has been moved to `vllm.tokenizers.hf.get_cached_tokenizer`. " + "`vllm.transformers_utils.tokenizer.get_tokenizer` " + "has been moved to `vllm.tokenizers.get_tokenizer`. " "The old name will be removed in v0.13.", DeprecationWarning, stacklevel=2, ) - return get_cached_tokenizer + return get_tokenizer + if name == "cached_get_tokenizer": + from vllm.tokenizers import cached_get_tokenizer + + warnings.warn( + "`vllm.transformers_utils.tokenizer.cached_get_tokenizer` " + "has been moved to `vllm.tokenizers.cached_get_tokenizer`. 
" + "The old name will be removed in v0.13.", + DeprecationWarning, + stacklevel=2, + ) + + return cached_get_tokenizer + if name == "cached_tokenizer_from_config": + from vllm.tokenizers import cached_tokenizer_from_config + + warnings.warn( + "`vllm.transformers_utils.tokenizer.cached_tokenizer_from_config` " + "has been moved to `vllm.tokenizers.cached_tokenizer_from_config`. " + "The old name will be removed in v0.13.", + DeprecationWarning, + stacklevel=2, + ) + + return cached_tokenizer_from_config + if name == "init_tokenizer_from_configs": + from vllm.tokenizers import init_tokenizer_from_config + + warnings.warn( + "`vllm.transformers_utils.tokenizer.init_tokenizer_from_configs` " + "has been moved to `vllm.tokenizers.init_tokenizer_from_config`. " + "The old name will be removed in v0.13.", + DeprecationWarning, + stacklevel=2, + ) + + return init_tokenizer_from_config raise AttributeError(f"module {__name__!r} has no attribute {name!r}") @@ -92,37 +121,3 @@ def encode_tokens( kw_args["add_special_tokens"] = add_special_tokens return tokenizer.encode(text, **kw_args) - - -cached_get_tokenizer = lru_cache(get_tokenizer) - - -def cached_tokenizer_from_config( - model_config: "ModelConfig", - **kwargs: Any, -): - return cached_get_tokenizer( - model_config.tokenizer, - tokenizer_mode=model_config.tokenizer_mode, - revision=model_config.tokenizer_revision, - trust_remote_code=model_config.trust_remote_code, - **kwargs, - ) - - -def init_tokenizer_from_configs(model_config: "ModelConfig"): - runner_type = model_config.runner_type - if runner_type == "generate" or runner_type == "draft": - truncation_side = "left" - elif runner_type == "pooling": - truncation_side = "right" - else: - assert_never(runner_type) - - return get_tokenizer( - model_config.tokenizer, - tokenizer_mode=model_config.tokenizer_mode, - trust_remote_code=model_config.trust_remote_code, - revision=model_config.tokenizer_revision, - truncation_side=truncation_side, - ) diff --git 
a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 17a271ca42e26..ec5d6e95ce3aa 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -26,10 +26,9 @@ from vllm.plugins.io_processors import get_io_processor from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.tasks import SupportedTask -from vllm.tokenizers import TokenizerLike +from vllm.tokenizers import TokenizerLike, init_tokenizer_from_config from vllm.tracing import init_tracer from vllm.transformers_utils.config import maybe_register_config_serialize_by_value -from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext from vllm.utils.async_utils import cancel_task_threadsafe from vllm.utils.collection_utils import as_list @@ -112,7 +111,7 @@ class AsyncLLM(EngineClient): if self.model_config.skip_tokenizer_init: tokenizer = None else: - tokenizer = init_tokenizer_from_configs(self.model_config) + tokenizer = init_tokenizer_from_config(self.model_config) self.input_processor = InputProcessor(self.vllm_config, tokenizer) self.io_processor = get_io_processor( diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index e7dfc554e76fa..d21cdf04ead26 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -23,9 +23,8 @@ from vllm.plugins.io_processors import get_io_processor from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.tasks import SupportedTask -from vllm.tokenizers import TokenizerLike +from vllm.tokenizers import TokenizerLike, init_tokenizer_from_config from vllm.tracing import init_tracer -from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core_client import EngineCoreClient @@ -87,7 +86,7 @@ class LLMEngine: if 
self.model_config.skip_tokenizer_init: tokenizer = None else: - tokenizer = init_tokenizer_from_configs(self.model_config) + tokenizer = init_tokenizer_from_config(self.model_config) self.input_processor = InputProcessor(self.vllm_config, tokenizer) self.io_processor = get_io_processor( diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 029129cf1a475..d087d28b1dae3 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -7,7 +7,7 @@ from typing import TYPE_CHECKING from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.reasoning import ReasoningParserManager -from vllm.transformers_utils.tokenizer import init_tokenizer_from_configs +from vllm.tokenizers import init_tokenizer_from_config from vllm.utils.import_utils import LazyLoader from vllm.v1.structured_output.backend_guidance import GuidanceBackend from vllm.v1.structured_output.backend_types import ( @@ -61,7 +61,7 @@ class StructuredOutputManager: # of CPUs. 
max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2) self.executor = ThreadPoolExecutor(max_workers=max_workers) - self.tokenizer = init_tokenizer_from_configs( + self.tokenizer = init_tokenizer_from_config( model_config=self.vllm_config.model_config ) reasoning_parser = ( From 4b612664fdfb4e87af6684403872d83ac04fa496 Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Tue, 2 Dec 2025 14:17:10 +0800 Subject: [PATCH 12/45] [CI] Renovation of nightly wheel build & generation (take 2) (#29838) Signed-off-by: Shengqi Chen --- .buildkite/release-pipeline.yaml | 16 +- .buildkite/scripts/generate-nightly-index.py | 369 ++++++++++++++++++ .buildkite/scripts/upload-wheels.sh | 119 +++--- .buildkite/test-pipeline.yaml | 3 +- docs/getting_started/installation/cpu.md | 15 +- .../installation/gpu.cuda.inc.md | 75 ++-- docs/getting_started/installation/gpu.md | 2 +- setup.py | 148 +++++-- vllm/envs.py | 7 +- 9 files changed, 606 insertions(+), 148 deletions(-) create mode 100644 .buildkite/scripts/generate-nightly-index.py diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 38c400ba1faf5..fbfc923998f89 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -8,7 +8,7 @@ steps: commands: # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." 
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-wheels.sh" @@ -30,19 +30,6 @@ steps: DOCKER_BUILDKIT: "1" # x86 + CUDA builds - - label: "Build wheel - CUDA 12.8" - depends_on: ~ - id: build-wheel-cuda-12-8 - agents: - queue: cpu_queue_postmerge - commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - - "mkdir artifacts" - - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-wheels.sh" - env: - DOCKER_BUILDKIT: "1" - - label: "Build wheel - CUDA 12.9" depends_on: ~ id: build-wheel-cuda-12-9 @@ -109,7 +96,6 @@ steps: - label: "Annotate release workflow" depends_on: - create-multi-arch-manifest - - build-wheel-cuda-12-8 id: annotate-release-workflow agents: queue: cpu_queue_postmerge diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py new file mode 100644 index 0000000000000..8d09ba178db7b --- /dev/null +++ b/.buildkite/scripts/generate-nightly-index.py @@ -0,0 +1,369 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# do not complain about line length (for docstring) +# ruff: noqa: E501 + +import argparse +import json +import re +import sys +from dataclasses import asdict, dataclass 
+from pathlib import Path +from typing import Any +from urllib.parse import quote + +if not sys.version_info >= (3, 12): + raise RuntimeError("This script requires Python 3.12 or higher.") + +INDEX_HTML_TEMPLATE = """ + + + +{items} + + +""" + + +@dataclass +class WheelFileInfo: + package_name: str + version: str + build_tag: str | None + python_tag: str + abi_tag: str + platform_tag: str + variant: str | None + filename: str + + +def parse_from_filename(file: str) -> WheelFileInfo: + """ + Parse wheel file name to extract metadata. + + The format of wheel names: + {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl + All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not). + Example: + vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl + vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl + vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl + vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl + """ + wheel_file_re = re.compile( + r"^(?P.+)-(?P[^-]+?)(-(?P[^-]+))?-(?P[^-]+)-(?P[^-]+)-(?P[^-]+)\.whl$" + ) + match = wheel_file_re.match(file) + if not match: + raise ValueError(f"Invalid wheel file name: {file}") + + package_name = match.group("package_name") + version = match.group("version") + build_tag = match.group("build_tag") + python_tag = match.group("python_tag") + abi_tag = match.group("abi_tag") + platform_tag = match.group("platform_tag") + + # extract variant from version + variant = None + if "dev" in version: + ver_after_dev = version.split("dev")[-1] + if "." in ver_after_dev: + variant = ver_after_dev.split(".")[-1] + version = version.removesuffix("." 
+ variant) + else: + if "+" in version: + version, variant = version.split("+") + + return WheelFileInfo( + package_name=package_name, + version=version, + build_tag=build_tag, + python_tag=python_tag, + abi_tag=abi_tag, + platform_tag=platform_tag, + variant=variant, + filename=file, + ) + + +def generate_project_list(subdir_names: list[str]) -> str: + """ + Generate project list HTML content linking to each project & variant sub-directory. + """ + href_tags = [] + for name in sorted(subdir_names): + name = name.strip("/").strip(".") + href_tags.append(f' {name}/
') + return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags)) + + +def generate_package_index_and_metadata( + wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path +) -> tuple[str, str]: + """ + Generate package index HTML content for a specific package, linking to actual wheel files. + """ + href_tags = [] + metadata = [] + for file in sorted(wheel_files, key=lambda x: x.filename): + relative_path = ( + wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename + ) + # handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B' + # NOTE: this is AWS S3 specific behavior! + file_path_quoted = quote(relative_path.as_posix(), safe=":%/") + href_tags.append(f' {file.filename}
') + file_meta = asdict(file) + file_meta["path"] = file_path_quoted + metadata.append(file_meta) + index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags)) + metadata_str = json.dumps(metadata, indent=2) + return index_str, metadata_str + + +def generate_index_and_metadata( + whl_files: list[str], + wheel_base_dir: Path, + index_base_dir: Path, + default_variant: str | None = None, + alias_to_default: str | None = None, +): + """ + Generate index for all wheel files. + + Args: + whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`). + wheel_base_dir (Path): Base directory for wheel files. + index_base_dir (Path): Base directory to store index files. + default_variant (str | None): The default variant name, if any. + alias_to_default (str | None): Alias variant name for the default variant, if any. + + First, parse all wheel files to extract metadata. + We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory). + The index for the default variant (if any) is generated in the root index directory. + + If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index + is purely a copy of the corresponding variant index, with only the links adjusted. + Otherwise, all wheels without variant suffixes are treated as the default variant. + + If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content + as the default variant index, but the links are adjusted accordingly. 
+ + Index directory structure: + index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/) + index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories + vllm/ + index.html # package index, pointing to actual files in wheel_base_dir (relative path) + metadata.json # machine-readable metadata for all wheels in this package + cpu/ # cpu variant sub-directory + index.html + vllm/ + index.html + metadata.json + cu129/ # cu129 is actually the alias to default variant + index.html + vllm/ + index.html + metadata.json + cu130/ # cu130 variant sub-directory + index.html + vllm/ + index.html + metadata.json + ... + + metadata.json stores a dump of all wheel files' metadata in a machine-readable format: + [ + { + "package_name": "vllm", + "version": "0.10.2rc2", + "build_tag": null, + "python_tag": "cp38", + "abi_tag": "abi3", + "platform_tag": "manylinux2014_aarch64", + "variant": "cu129", + "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl", + "path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL and URL-encoded + }, + ... + ] + """ + + parsed_files = [parse_from_filename(f) for f in whl_files] + + if not parsed_files: + print("No wheel files found, skipping index generation.") + return + + # Group by variant + variant_to_files: dict[str, list[WheelFileInfo]] = {} + for file in parsed_files: + variant = file.variant or "default" + if variant not in variant_to_files: + variant_to_files[variant] = [] + variant_to_files[variant].append(file) + + print(f"Found variants: {list(variant_to_files.keys())}") + + # sanity check for default variant + if default_variant: + if "default" in variant_to_files: + raise ValueError( + "All wheel files must have variant suffixes when `default_variant` is specified." + ) + if default_variant not in variant_to_files: + raise ValueError( + f"Default variant '{default_variant}' not found among wheel files." 
+ ) + + if alias_to_default: + if "default" not in variant_to_files: + # e.g. only some wheels are uploaded to S3 currently + print( + "[WARN] Alias to default variant specified, but no default variant found." + ) + elif alias_to_default in variant_to_files: + raise ValueError( + f"Alias variant name '{alias_to_default}' already exists among wheel files." + ) + else: + variant_to_files[alias_to_default] = variant_to_files["default"].copy() + print(f"Alias variant '{alias_to_default}' created for default variant.") + + # Generate index for each variant + subdir_names = set() + for variant, files in variant_to_files.items(): + if variant == "default": + variant_dir = index_base_dir + else: + variant_dir = index_base_dir / variant + subdir_names.add(variant) + + variant_dir.mkdir(parents=True, exist_ok=True) + + # gather all package names in this variant + packages = set(f.package_name for f in files) + if variant == "default": + # these packages should also appear in the "project list" + # generate after all variants are processed + subdir_names = subdir_names.union(packages) + else: + # generate project list for this variant directly + project_list_str = generate_project_list(sorted(packages)) + with open(variant_dir / "index.html", "w") as f: + f.write(project_list_str) + + for package in packages: + # filter files belonging to this package only + package_files = [f for f in files if f.package_name == package] + package_dir = variant_dir / package + package_dir.mkdir(parents=True, exist_ok=True) + index_str, metadata_str = generate_package_index_and_metadata( + package_files, wheel_base_dir, package_dir + ) + with open(package_dir / "index.html", "w") as f: + f.write(index_str) + with open(package_dir / "metadata.json", "w") as f: + f.write(metadata_str) + + # Generate top-level project list index + project_list_str = generate_project_list(sorted(subdir_names)) + with open(index_base_dir / "index.html", "w") as f: + f.write(project_list_str) + + +if __name__ == 
"__main__": + """ + Arguments: + --version : version string for the current build (e.g., commit hash) + --current-objects : path to JSON file containing current S3 objects listing in this version directory + --output-dir : directory to store generated index files + --alias-to-default : (optional) alias variant name for the default variant + """ + + parser = argparse.ArgumentParser( + description="Process nightly build wheel files to generate indices." + ) + parser.add_argument( + "--version", + type=str, + required=True, + help="Version string for the current build (e.g., commit hash)", + ) + parser.add_argument( + "--current-objects", + type=str, + required=True, + help="Path to JSON file containing current S3 objects listing in this version directory", + ) + parser.add_argument( + "--output-dir", + type=str, + required=True, + help="Directory to store generated index files", + ) + parser.add_argument( + "--alias-to-default", + type=str, + default=None, + help="Alias variant name for the default variant", + ) + + args = parser.parse_args() + + version = args.version + if "/" in version or "\\" in version: + raise ValueError("Version string must not contain slashes.") + current_objects_path = Path(args.current_objects) + output_dir = Path(args.output_dir) + if not output_dir.exists(): + output_dir.mkdir(parents=True, exist_ok=True) + + # Read current objects JSON + with open(current_objects_path) as f: + current_objects: dict[str, list[dict[str, Any]]] = json.load(f) + + # current_objects looks like from list_objects_v2 S3 API: + """ + "Contents": [ + { + "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl", + "LastModified": "2025-11-28T14:00:32+00:00", + "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"", + "ChecksumAlgorithm": [ + "CRC64NVME" + ], + "ChecksumType": "FULL_OBJECT", + "Size": 435649349, + "StorageClass": "STANDARD" + }, + ... 
+ ] + """ + + # Extract wheel file keys + wheel_files = [] + for item in current_objects.get("Contents", []): + key: str = item["Key"] + if key.endswith(".whl"): + wheel_files.append(key.split("/")[-1]) # only the filename is used + + print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}") + + # Generate index and metadata, assuming wheels and indices are stored as: + # s3://vllm-wheels/{version}/ + # s3://vllm-wheels// + wheel_base_dir = Path(output_dir).parent / version + index_base_dir = Path(output_dir) + + generate_index_and_metadata( + whl_files=wheel_files, + wheel_base_dir=wheel_base_dir, + index_base_dir=index_base_dir, + default_variant=None, + alias_to_default=args.alias_to_default, + ) + print(f"Successfully generated index and metadata in {output_dir}") diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index 945c5e48c0090..2eaa91c04086c 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -2,6 +2,28 @@ set -ex +# ======== part 0: setup ======== + +BUCKET="vllm-wheels" +INDICES_OUTPUT_DIR="indices" +DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py +PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3 +SUBPATH=$BUILDKITE_COMMIT +S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/" + +# detect if python3.10+ is available +has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)") +if [[ "$has_new_python" -eq 0 ]]; then + # use new python from docker + docker pull python:3-slim + PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3" +fi + +echo "Using python interpreter: $PYTHON" +echo "Python version: $($PYTHON --version)" + +# ========= part 1: collect, rename & upload the wheel ========== + # Assume wheels are in artifacts/dist/*.whl wheel_files=(artifacts/dist/*.whl) @@ -10,74 +32,69 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then echo "Error: Expected 
exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}" exit 1 fi - -# Get the single wheel file wheel="${wheel_files[0]}" -# Detect architecture and rename 'linux' to appropriate manylinux version -arch=$(uname -m) -if [[ $arch == "x86_64" ]]; then - manylinux_version="manylinux1" -elif [[ $arch == "aarch64" ]]; then - manylinux_version="manylinux2014" -else - echo "Warning: Unknown architecture $arch, using manylinux1 as default" - manylinux_version="manylinux1" -fi +# current build image uses ubuntu 20.04, which corresponds to manylinux_2_31 +# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels +manylinux_version="manylinux_2_31" # Rename 'linux' to the appropriate manylinux version in the wheel filename +if [[ "$wheel" != *"linux"* ]]; then + echo "Error: Wheel filename does not contain 'linux': $wheel" + exit 1 +fi new_wheel="${wheel/linux/$manylinux_version}" mv -- "$wheel" "$new_wheel" wheel="$new_wheel" +echo "Renamed wheel to: $wheel" # Extract the version from the wheel version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) -echo "Version: $version" +echo "Version in wheel: $version" +pure_version="${version%%+*}" +echo "Pure version (without variant): $pure_version" -normal_wheel="$wheel" # Save the original wheel filename +# copy wheel to its own bucket +aws s3 cp "$wheel" "$S3_COMMIT_PREFIX" -# If the version contains "dev", rename it to v1.0.0.dev for consistency -if [[ $version == *dev* ]]; then - suffix="${version##*.}" - if [[ $suffix == cu* ]]; then - new_version="1.0.0.dev+${suffix}" - else - new_version="1.0.0.dev" - fi - new_wheel="${wheel/$version/$new_version}" - # use cp to keep both files in the artifacts directory - cp -- "$wheel" "$new_wheel" - wheel="$new_wheel" - version="$new_version" -fi +# ========= part 2: generate and upload indices ========== +# generate indices for all existing wheels in the commit directory +# this script might be 
run multiple times if there are multiple variants being built +# so we need to guarantee there is little chance for "TOCTOU" issues +# i.e., one process is generating indices while another is uploading a new wheel +# so we need to ensure no time-consuming operations happen below -# Upload the wheel to S3 -python3 .buildkite/generate_index.py --wheel "$normal_wheel" +# list all wheels in the commit directory +echo "Existing wheels on S3:" +aws s3 ls "$S3_COMMIT_PREFIX" +obj_json="objects.json" +aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json" +mkdir -p "$INDICES_OUTPUT_DIR" -# generate index for this commit -aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" -aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" - -if [[ $normal_wheel == *"cu129"* ]]; then - # only upload index.html for cu129 wheels (default wheels) as it - # is available on both x86 and arm64 - aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" - aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" +# call script to generate indices for all existing wheels +# these indices have relative paths that work as long as they are next to the wheel directory in s3 +# i.e., the wheels are always in s3://vllm-wheels/<commit>/ +# and indices can be placed in <bucket>/<commit>/, or <bucket>/nightly/, or <bucket>/<version>/ +if [[ ! 
-z "$DEFAULT_VARIANT_ALIAS" ]]; then + alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS" else - echo "Skipping index files for non-cu129 wheels" + alias_arg="" fi -# generate index for nightly -aws s3 cp "$wheel" "s3://vllm-wheels/nightly/" -aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/" +$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg -if [[ $normal_wheel == *"cu129"* ]]; then - # only upload index.html for cu129 wheels (default wheels) as it - # is available on both x86 and arm64 - aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" -else - echo "Skipping index files for non-cu129 wheels" +# copy indices to // unconditionally +echo "Uploading indices to $S3_COMMIT_PREFIX" +aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX" + +# copy to /nightly/ only if it is on the main branch and not a PR +if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then + echo "Uploading indices to overwrite /nightly/" + aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/" fi -aws s3 cp "$wheel" "s3://vllm-wheels/$version/" -aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html" +# copy to // only if it does not have "dev" in the version +if [[ "$version" != *"dev"* ]]; then + echo "Uploading indices to overwrite /$pure_version/" + aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/" +fi diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 0e715a719d27d..9f2107fb1e5ab 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -215,7 +215,6 @@ steps: timeout_in_minutes: 10 gpu: h100 num_gpus: 8 - optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - examples/offline_inference/torchrun_dp_example.py @@ -1370,4 +1369,4 @@ steps: num_gpus: 2 working_dir: "/vllm-workspace" commands: - - bash 
.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 \ No newline at end of file diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index d1beab7855b18..18dc6d19434b3 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -46,10 +46,23 @@ vLLM is a Python library that supports the following CPU variants. Select your C ### Pre-built wheels -Currently, there are no pre-built CPU wheels. +Please refer to the instructions for [pre-built wheels on GPU](./gpu.md#pre-built-wheels). + +When specifying the index URL, please make sure to use the `cpu` variant subdirectory. +For example, the nightly build index is: `https://wheels.vllm.ai/nightly/cpu/`. ### Build wheel from source +#### Set up using Python-only build (without compilation) {#python-only-build} + +Please refer to the instructions for [Python-only build on GPU](./gpu.md#python-only-build), and replace the build commands with: + +```bash +VLLM_USE_PRECOMPILED=1 VLLM_PRECOMPILED_WHEEL_VARIANT=cpu VLLM_TARGET_DEVICE=cpu uv pip install --editable . +``` + +#### Full build (with compilation) {#full-build} + === "Intel/AMD x86" --8<-- "docs/getting_started/installation/cpu.x86.inc.md:build-wheel-from-source" diff --git a/docs/getting_started/installation/gpu.cuda.inc.md b/docs/getting_started/installation/gpu.cuda.inc.md index 601d3659af886..03ce28c78efc9 100644 --- a/docs/getting_started/installation/gpu.cuda.inc.md +++ b/docs/getting_started/installation/gpu.cuda.inc.md @@ -26,42 +26,49 @@ uv pip install vllm --torch-backend=auto ??? console "pip" ```bash - # Install vLLM with CUDA 12.8. - pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128 + # Install vLLM with CUDA 12.9. 
+ pip install vllm --extra-index-url https://download.pytorch.org/whl/cu129 ``` -We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`). If this doesn't work, try running `uv self update` to update `uv` first. +We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu128`), set `--torch-backend=cu128` (or `UV_TORCH_BACKEND=cu128`). If this doesn't work, try running `uv self update` to update `uv` first. !!! note NVIDIA Blackwell GPUs (B200, GB200) require a minimum of CUDA 12.8, so make sure you are installing PyTorch wheels with at least that version. PyTorch itself offers a [dedicated interface](https://pytorch.org/get-started/locally/) to determine the appropriate pip command to run for a given target configuration. -As of now, vLLM's binaries are compiled with CUDA 12.8 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.6, 11.8, and public PyTorch release versions: +As of now, vLLM's binaries are compiled with CUDA 12.9 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.8, 13.0, and public PyTorch release versions: ```bash -# Install vLLM with a specific CUDA version (e.g., 11.8 or 12.6). +# Install vLLM with a specific CUDA version (e.g., 13.0). 
export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//') -export CUDA_VERSION=118 # or 126 -uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION} +export CUDA_VERSION=130 # or other +uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu${CUDA_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION} ``` #### Install the latest code -LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since `v0.5.3`. +LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for every commit since `v0.5.3` on <https://wheels.vllm.ai>. There are multiple indices that could be used: + +* `https://wheels.vllm.ai/nightly`: the default variant (CUDA with version specified in `VLLM_MAIN_CUDA_VERSION`) built with the last commit on the `main` branch. Currently it is CUDA 12.9. +* `https://wheels.vllm.ai/nightly/<variant>/`: all other variants. Now this includes `cu130` and `cpu`. The default variant (`cu129`) also has a subdirectory for consistency. + +To install from the nightly index, run: ```bash uv pip install -U vllm \ --torch-backend=auto \ - --extra-index-url https://wheels.vllm.ai/nightly + --extra-index-url https://wheels.vllm.ai/nightly # add variant subdirectory here if needed ``` -??? 
console "pip" - ```bash - pip install -U vllm \ - --pre \ - --extra-index-url https://wheels.vllm.ai/nightly - ``` +!!! warning "`pip` caveat" - `--pre` is required for `pip` to consider pre-released versions. + Using `pip` to install from nightly indices is _not supported_, because `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. In contrast, `uv` gives the extra index [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). + + If you insist on using `pip`, you have to specify the full URL of the wheel file (which can be obtained from the web page). + + ```bash + pip install -U https://wheels.vllm.ai/nightly/vllm-0.11.2.dev399%2Bg3c7461c18-cp38-abi3-manylinux_2_31_x86_64.whl # current nightly build (the filename will change!) + pip install -U https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-0.11.2.dev399%2Bg3c7461c18-cp38-abi3-manylinux_2_31_x86_64.whl # from specific commit + ``` ##### Install specific revisions @@ -71,33 +78,13 @@ If you want to access the wheels for previous commits (e.g. to bisect the behavi export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch uv pip install vllm \ --torch-backend=auto \ - --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} + --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} # add variant subdirectory here if needed ``` -The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. 
In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. - -??? note "pip" - If you want to access the wheels for previous commits (e.g. to bisect the behavior change, - performance regression), due to the limitation of `pip`, you have to specify the full URL of the - wheel file by embedding the commit hash in the URL: - - ```bash - export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch - pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl - ``` - - Note that the wheels are built with Python 3.8 ABI (see [PEP - 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible - with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a - placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in - the wheel metadata (the wheels listed in the extra index url have correct versions). Although we - don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the - wheels are still built with Python 3.8 ABI to keep the same wheel name as before. - # --8<-- [end:pre-built-wheels] # --8<-- [start:build-wheel-from-source] -#### Set up using Python-only build (without compilation) +#### Set up using Python-only build (without compilation) {#python-only-build} If you only need to change Python code, you can build and install vLLM without compilation. 
Using `uv pip`'s [`--editable` flag](https://docs.astral.sh/uv/pip/packages/#editable-packages), changes you make to the code will be reflected when you run vLLM: @@ -121,18 +108,24 @@ This command will do the following: In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the main branch was just merged and the wheel is being built. In this case, you can wait for around an hour to try again, or manually assign the previous commit in the installation using the `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable. ```bash -export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch -export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +export VLLM_PRECOMPILED_WHEEL_COMMIT=$(git rev-parse HEAD~1) # or earlier commit on main +export VLLM_USE_PRECOMPILED=1 uv pip install --editable . ``` +There are more environment variables to control the behavior of Python-only build: + +* `VLLM_PRECOMPILED_WHEEL_LOCATION`: specify the exact wheel URL or local file path of a pre-compiled wheel to use. All other logic to find the wheel will be skipped. +* `VLLM_PRECOMPILED_WHEEL_COMMIT`: override the commit hash to download the pre-compiled wheel. It can be `nightly` to use the last **already built** commit on the main branch. +* `VLLM_PRECOMPILED_WHEEL_VARIANT`: specify the variant subdirectory to use on the nightly index, e.g., `cu129`, `cpu`. If not specified, the CUDA variant with `VLLM_MAIN_CUDA_VERSION` will be tried, then fallback to the default variant on the remote index. + You can find more information about vLLM's wheels in [Install the latest code](#install-the-latest-code). !!! note There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. 
It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [Install the latest code](#install-the-latest-code) for instructions on how to install a specified wheel. -#### Full build (with compilation) +#### Full build (with compilation) {#full-build} If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: diff --git a/docs/getting_started/installation/gpu.md b/docs/getting_started/installation/gpu.md index bc7508b29475f..fb750f4499858 100644 --- a/docs/getting_started/installation/gpu.md +++ b/docs/getting_started/installation/gpu.md @@ -52,7 +52,7 @@ vLLM is a Python library that supports the following GPU variants. Select your G --8<-- "docs/getting_started/installation/gpu.xpu.inc.md:set-up-using-python" -### Pre-built wheels +### Pre-built wheels {#pre-built-wheels} === "NVIDIA CUDA" diff --git a/setup.py b/setup.py index 0022e7fe0bf36..5b7d12bb373e3 100644 --- a/setup.py +++ b/setup.py @@ -311,7 +311,7 @@ class precompiled_build_ext(build_ext): """Disables extension building when using precompiled binaries.""" def run(self) -> None: - assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" + return def build_extensions(self) -> None: print("Skipping build_ext: using precompiled extensions.") @@ -322,14 +322,121 @@ class precompiled_wheel_utils: """Extracts libraries and other files from an existing wheel.""" @staticmethod - def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict: + def fetch_metadata_for_variant( + commit: str, variant: str | None + ) -> tuple[list[dict], str]: + """ + Fetches metadata for a specific variant of the precompiled wheel. 
+ """ + variant_dir = f"{variant}/" if variant is not None else "" + repo_url = f"https://wheels.vllm.ai/{commit}/{variant_dir}vllm/" + meta_url = repo_url + "metadata.json" + print(f"Trying to fetch nightly build metadata from {meta_url}") + from urllib.request import urlopen + + with urlopen(meta_url) as resp: + # urlopen raises HTTPError on unexpected status code + wheels = json.loads(resp.read().decode("utf-8")) + return wheels, repo_url + + @staticmethod + def determine_wheel_url() -> tuple[str, str | None]: + """ + Try to determine the precompiled wheel URL or path to use. + The order of preference is: + 1. user-specified wheel location (can be either local or remote, via + VLLM_PRECOMPILED_WHEEL_LOCATION) + 2. user-specified variant from nightly repo (current main commit via + VLLM_PRECOMPILED_WHEEL_VARIANT) + 3. the variant corresponding to VLLM_MAIN_CUDA_VERSION from nightly repo + 4. the default variant from nightly repo (current main commit) + """ + wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) + if wheel_location is not None: + print(f"Using user-specified precompiled wheel location: {wheel_location}") + return wheel_location, None + else: + import platform + + arch = platform.machine() + # try to fetch the wheel metadata from the nightly wheel repo + main_variant = "cu" + envs.VLLM_MAIN_CUDA_VERSION.replace(".", "") + variant = os.getenv("VLLM_PRECOMPILED_WHEEL_VARIANT", main_variant) + commit = os.getenv( + "VLLM_PRECOMPILED_WHEEL_COMMIT", + precompiled_wheel_utils.get_base_commit_in_main_branch(), + ) + print(f"Using precompiled wheel commit {commit} with variant {variant}") + try_default = False + wheels, repo_url, download_filename = None, None, None + try: + wheels, repo_url = precompiled_wheel_utils.fetch_metadata_for_variant( + commit, variant + ) + except Exception as e: + logger.warning( + "Failed to fetch precompiled wheel metadata for variant %s: %s", + variant, + e, + ) + try_default = True # try outside handler to 
keep the stacktrace simple + if try_default: + print("Trying the default variant from remote") + wheels, repo_url = precompiled_wheel_utils.fetch_metadata_for_variant( + commit, None + ) + # if this also fails, then we have nothing more to try / cache + assert wheels is not None and repo_url is not None, ( + "Failed to fetch precompiled wheel metadata" + ) + # The metadata.json has the following format: + # see .buildkite/scripts/generate-nightly-index.py for details + """[{ + "package_name": "vllm", + "version": "0.11.2.dev278+gdbc3d9991", + "build_tag": null, + "python_tag": "cp38", + "abi_tag": "abi3", + "platform_tag": "manylinux1_x86_64", + "variant": null, + "filename": "vllm-0.11.2.dev278+gdbc3d9991-cp38-abi3-manylinux1_x86_64.whl", + "path": "../vllm-0.11.2.dev278%2Bgdbc3d9991-cp38-abi3-manylinux1_x86_64.whl" + }, + ...]""" + from urllib.parse import urljoin + + for wheel in wheels: + # TODO: maybe check more compatibility later? (python_tag, abi_tag, etc) + if wheel.get("package_name") == "vllm" and arch in wheel.get( + "platform_tag", "" + ): + print(f"Found precompiled wheel metadata: {wheel}") + if "path" not in wheel: + raise ValueError(f"Wheel metadata missing path: {wheel}") + wheel_url = urljoin(repo_url, wheel["path"]) + download_filename = wheel.get("filename") + print(f"Using precompiled wheel URL: {wheel_url}") + break + else: + raise ValueError( + f"No precompiled vllm wheel found for architecture {arch} " + f"from repo {repo_url}. 
All available wheels: {wheels}" + ) + + return wheel_url, download_filename + + @staticmethod + def extract_precompiled_and_patch_package( + wheel_url_or_path: str, download_filename: str | None + ) -> dict: import tempfile import zipfile temp_dir = None try: if not os.path.isfile(wheel_url_or_path): - wheel_filename = wheel_url_or_path.split("/")[-1] + # use provided filename first, then derive from URL + wheel_filename = download_filename or wheel_url_or_path.split("/")[-1] temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") wheel_path = os.path.join(temp_dir, wheel_filename) print(f"Downloading wheel from {wheel_url_or_path} to {wheel_path}") @@ -648,38 +755,13 @@ package_data = { ] } + # If using precompiled, extract and patch package_data (in advance of setup) if envs.VLLM_USE_PRECOMPILED: - assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" - wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) - if wheel_location is not None: - wheel_url = wheel_location - else: - import platform - - arch = platform.machine() - if arch == "x86_64": - wheel_tag = "manylinux1_x86_64" - elif arch == "aarch64": - wheel_tag = "manylinux2014_aarch64" - else: - raise ValueError(f"Unsupported architecture: {arch}") - base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch() - wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl" - nightly_wheel_url = ( - f"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-{wheel_tag}.whl" - ) - from urllib.request import urlopen - - try: - with urlopen(wheel_url) as resp: - if resp.status != 200: - wheel_url = nightly_wheel_url - except Exception as e: - print(f"[warn] Falling back to nightly wheel: {e}") - wheel_url = nightly_wheel_url - - patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(wheel_url) + wheel_url, download_filename = precompiled_wheel_utils.determine_wheel_url() + patch = 
precompiled_wheel_utils.extract_precompiled_and_patch_package( + wheel_url, download_filename + ) for pkg, files in patch.items(): package_data.setdefault(pkg, []).extend(files) diff --git a/vllm/envs.py b/vllm/envs.py index 46f1aa3222be7..d0912863e6444 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -74,7 +74,7 @@ if TYPE_CHECKING: VLLM_MEDIA_CONNECTOR: str = "http" VLLM_MM_INPUT_CACHE_GIB: int = 4 VLLM_TARGET_DEVICE: str = "cuda" - VLLM_MAIN_CUDA_VERSION: str = "12.8" + VLLM_MAIN_CUDA_VERSION: str = "12.9" MAX_JOBS: str | None = None NVCC_THREADS: str | None = None VLLM_USE_PRECOMPILED: bool = False @@ -445,10 +445,9 @@ environment_variables: dict[str, Callable[[], Any]] = { # Target device of vLLM, supporting [cuda (by default), # rocm, cpu] "VLLM_TARGET_DEVICE": lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(), - # Main CUDA version of vLLM, supporting [12.6, 12.8, 12.9], - # 12.8 is the default. This follows PyTorch but can be overridden. + # Main CUDA version of vLLM. This follows PyTorch but can be overridden. "VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() - or "12.8", + or "12.9", # Maximum number of compilation jobs to run in parallel. 
# By default this is the number of CPUs "MAX_JOBS": lambda: os.getenv("MAX_JOBS", None), From 13ea39bc09cf4c102ba4ad308df379dc5abc3ba4 Mon Sep 17 00:00:00 2001 From: Zhang Xiangze Date: Tue, 2 Dec 2025 14:21:39 +0800 Subject: [PATCH 13/45] [CPU]Parallelize over tokens in int4 moe (#29600) Signed-off-by: Zhang Xiangze --- csrc/moe/dynamic_4bit_int_moe_cpu.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp index df47bb8dd1d7d..58dc402016881 100644 --- a/csrc/moe/dynamic_4bit_int_moe_cpu.cpp +++ b/csrc/moe/dynamic_4bit_int_moe_cpu.cpp @@ -93,16 +93,16 @@ torch::Tensor dynamic_4bit_int_moe_cpu( } auto Y_all = at::empty({offsets[E], H}, x_c.options()); - at::parallel_for(0, E, 1, [&](int64_t e_begin, int64_t e_end) { + at::parallel_for(0, offsets[E], 0, [&](int64_t idx_begin, int64_t idx_end) { c10::InferenceMode guard; - for (int64_t e = e_begin; e < e_end; ++e) { - const int64_t te = counts[e]; - if (te == 0) { + for (int64_t e = 0; e < E; ++e) { + int64_t start = std::max(offsets[e], idx_begin); + int64_t end = std::min(offsets[e + 1], idx_end); + int64_t te = end - start; + if (te <= 0) { continue; } - const int64_t start = offsets[e]; - auto x_e = X_all.narrow(/*dim=*/0, /*start=*/start, /*length=*/te); auto w13_e = w13_packed.select(/*dim=*/0, e); From f5b0846ba0aa6a4b6ab788ff257d0a00eb376e75 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Dec 2025 07:05:27 +0000 Subject: [PATCH 14/45] Fix some Transformers nightly tests (#29802) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/jina_vl.py | 2 +- vllm/model_executor/models/modernbert.py | 51 ++++++++++++------------ vllm/model_executor/models/qwen2.py | 2 +- 3 files changed, 28 insertions(+), 27 deletions(-) diff --git a/vllm/model_executor/models/jina_vl.py b/vllm/model_executor/models/jina_vl.py 
index 05a40837954d8..8bba7b62882f1 100644 --- a/vllm/model_executor/models/jina_vl.py +++ b/vllm/model_executor/models/jina_vl.py @@ -29,7 +29,7 @@ logger = init_logger(__name__) class JinaVLScorer(nn.Module): def __init__(self, model_config: "ModelConfig"): super().__init__() - config = model_config.hf_config + config = model_config.hf_config.get_text_config() head_dtype = model_config.head_dtype self.dense = ColumnParallelLinear( config.hidden_size, config.hidden_size, params_dtype=head_dtype, bias=True diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 743bc23d9876f..be36f761c63aa 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -20,7 +20,7 @@ from vllm.model_executor.layers.pooler import ( PoolingParamsUpdate, PoolingType, ) -from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding +from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors @@ -62,19 +62,6 @@ class ModernBertEmbeddings(nn.Module): return embeddings -class ModernBertRotaryEmbedding(RotaryEmbedding): - def __init__(self, config: ModernBertConfig, head_size: int, dim: int, base: float): - super().__init__( - head_size=head_size, - rotary_dim=dim, - max_position_embeddings=config.max_position_embeddings, - base=base, - is_neox_style=True, - dtype=torch.float16, - ) - self.config = config - - class ModernBertAttention(nn.Module): def __init__(self, config: ModernBertConfig, layer_id: int | None = None): super().__init__() @@ -95,19 +82,33 @@ class ModernBertAttention(nn.Module): bias=config.attention_bias, ) - sliding_window = None - if layer_id % config.global_attn_every_n_layers != 0: - sliding_window = config.local_attention // 2 - rope_theta = ( - 
config.local_rope_theta - if config.local_rope_theta is not None - else config.global_rope_theta - ) + if layer_types := getattr(config, "layer_types", None): + # Transformers v5 + layer_type = layer_types[layer_id] + rope_parameters = config.rope_parameters[layer_type] + sliding_window: int | None = None + if layer_type == "sliding_attention": + sliding_window = config.local_attention // 2 else: - rope_theta = config.global_rope_theta + # Transformers v4 + sliding_window = None + if layer_id % config.global_attn_every_n_layers != 0: + sliding_window = config.local_attention // 2 + rope_theta = ( + config.local_rope_theta + if config.local_rope_theta is not None + else config.global_rope_theta + ) + else: + rope_theta = config.global_rope_theta + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} - self.rotary_emb = ModernBertRotaryEmbedding( - config=config, head_size=self.head_dim, dim=self.head_dim, base=rope_theta + self.rotary_emb = get_rope( + head_size=self.head_dim, + rotary_dim=self.head_dim, + max_position=config.max_position_embeddings, + rope_parameters=rope_parameters, + dtype=torch.float16, ) self.attn = EncoderOnlyAttention( self.num_heads, diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 34c31d8deee23..f5501bae78418 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -503,7 +503,7 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsEagle3): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - config = vllm_config.model_config.hf_config + config = vllm_config.model_config.hf_config.get_text_config() quant_config = vllm_config.quant_config self.config = config From 0037b5746a4319c851eed2bc0b24912e179a851c Mon Sep 17 00:00:00 2001 From: Wushi Dong <33078715+wushidonguc@users.noreply.github.com> Date: Mon, 1 Dec 2025 23:08:07 -0800 Subject: [PATCH 15/45] [Core] Eliminate redundant is_encoder_decoder 
lookups (20-40us/step) (#29800) Signed-off-by: Wushi Dong --- vllm/v1/worker/gpu_model_runner.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9eacd2138978b..ee28f477a26ad 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2439,16 +2439,13 @@ class GPUModelRunner( ]: num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens is_first_rank = get_pp_group().is_first_rank + is_encoder_decoder = self.model_config.is_encoder_decoder # _prepare_inputs may reorder the batch, so we must gather multi # modal outputs after that to ensure the correct order ec_connector_output = None - if ( - self.supports_mm_inputs - and is_first_rank - and not self.model_config.is_encoder_decoder - ): + if self.supports_mm_inputs and is_first_rank and not is_encoder_decoder: # Run the multimodal encoder if any. with self.maybe_get_ec_connector_output( scheduler_output, @@ -2526,10 +2523,7 @@ class GPUModelRunner( num_input_tokens, intermediate_tensors, True ) - if ( - self.model_config.is_encoder_decoder - and scheduler_output.scheduled_encoder_inputs - ): + if is_encoder_decoder and scheduler_output.scheduled_encoder_inputs: # Run the encoder, just like we do with other multimodal inputs. # For an encoder-decoder model, our processing here is a bit # simpler, because the outputs are just passed to the decoder. 
From 3b221cb661676fb2e9ae791ede9cb7f5cf4752d1 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Mon, 1 Dec 2025 23:49:16 -0800 Subject: [PATCH 16/45] [BugFix] respect VLLM_LOGGING_LEVEL in logger (#29761) Signed-off-by: Boyuan Feng --- tests/conftest.py | 1 + tests/test_config.py | 4 ++-- tests/test_logger.py | 4 ++-- vllm/logger.py | 5 ++++- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 317b36ba6cb80..53bbaddd0bb7f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1174,6 +1174,7 @@ def caplog_mp_spawn(tmp_path, monkeypatch): "level": level, "filename": log_path.as_posix(), } + config["loggers"]["vllm"]["level"] = level config_path.write_text(json.dumps(config)) diff --git a/tests/test_config.py b/tests/test_config.py index 112b02edd0389..76e0d94425fa6 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -716,7 +716,7 @@ def test_is_chunked_prefill_supported( ): model_config = ModelConfig(model_id, trust_remote_code=True) assert model_config.attn_type == expected_attn_type - with caplog_vllm.at_level(level=logging.DEBUG): + with caplog_vllm.at_level(level=logging.DEBUG, logger="vllm"): assert model_config.is_chunked_prefill_supported == expected_result assert reason in caplog_vllm.text @@ -835,7 +835,7 @@ def test_is_prefix_caching_supported( ): model_config = ModelConfig(model_id, trust_remote_code=True) assert model_config.attn_type == expected_attn_type - with caplog_vllm.at_level(level=logging.DEBUG): + with caplog_vllm.at_level(level=logging.DEBUG, logger="vllm"): assert model_config.is_prefix_caching_supported == expected_result assert reason in caplog_vllm.text diff --git a/tests/test_logger.py b/tests/test_logger.py index 8900e9c2a1e69..b4f44f52d4df9 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -57,7 +57,7 @@ def test_default_vllm_root_logger_configuration(monkeypatch): _configure_vllm_root_logger() logger = logging.getLogger("vllm") - assert logger.level == 
logging.DEBUG + assert logger.level == logging.INFO assert not logger.propagate handler = logger.handlers[0] @@ -524,7 +524,7 @@ def mp_function(**kwargs): def test_caplog_mp_fork(caplog_vllm, caplog_mp_fork): - with caplog_vllm.at_level(logging.DEBUG), caplog_mp_fork(): + with caplog_vllm.at_level(logging.DEBUG, logger="vllm"), caplog_mp_fork(): import multiprocessing ctx = multiprocessing.get_context("fork") diff --git a/vllm/logger.py b/vllm/logger.py index ad3123c0f0149..3b7bb1f22ec96 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -62,7 +62,7 @@ DEFAULT_LOGGING_CONFIG = { "loggers": { "vllm": { "handlers": ["vllm"], - "level": "DEBUG", + "level": envs.VLLM_LOGGING_LEVEL, "propagate": False, }, }, @@ -175,6 +175,9 @@ def _configure_vllm_root_logger() -> None: vllm_handler["stream"] = envs.VLLM_LOGGING_STREAM vllm_handler["formatter"] = "vllm_color" if _use_color() else "vllm" + vllm_loggers = logging_config["loggers"]["vllm"] + vllm_loggers["level"] = envs.VLLM_LOGGING_LEVEL + if envs.VLLM_LOGGING_CONFIG_PATH: if not path.exists(envs.VLLM_LOGGING_CONFIG_PATH): raise RuntimeError( From 48d15a32aa567dfc59ede46683b01cc2321579cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=B0=E5=85=AE?= <38908462+zhyajie@users.noreply.github.com> Date: Tue, 2 Dec 2025 16:02:12 +0800 Subject: [PATCH 17/45] [CI] Fix Bad_words test for tokenizer encode/decode asymmetry (#28193) Signed-off-by: zhyajie Co-authored-by: zhyajie --- tests/v1/sample/test_sampling_params_e2e.py | 27 ++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/tests/v1/sample/test_sampling_params_e2e.py b/tests/v1/sample/test_sampling_params_e2e.py index 1684252174d3d..a75a37befe0e1 100644 --- a/tests/v1/sample/test_sampling_params_e2e.py +++ b/tests/v1/sample/test_sampling_params_e2e.py @@ -106,6 +106,25 @@ def test_detokenize_false(llm): def test_bad_words(llm): """Check that we respect bad words.""" + tokenizer = llm.get_tokenizer() + + def contains_bad_word(text: str, tokens: 
list[int], bad_word: str) -> bool: + """Check if word appears in BOTH text and token sequence.""" + if bad_word not in text: + return False + + for add_prefix_space in [False, True]: + prefix = " " if add_prefix_space else "" + bad_words_token = tokenizer.encode( + prefix + bad_word.lstrip(), add_special_tokens=False + ) + if not bad_words_token: + continue + for i in range(len(tokens) - len(bad_words_token) + 1): + if tokens[i : i + len(bad_words_token)] == bad_words_token: + return True + return False + output = llm.generate(PROMPT, SamplingParams(temperature=0)) split_text = output[0].outputs[0].text.split() @@ -113,14 +132,16 @@ def test_bad_words(llm): params = SamplingParams(temperature=0, bad_words=[bad_words_1]) output = llm.generate(PROMPT, params) new_text = output[0].outputs[0].text - assert bad_words_1 not in new_text + new_tokens = output[0].outputs[0].token_ids + assert not contains_bad_word(new_text, new_tokens, bad_words_1) bad_words_2 = new_text.split()[-1] params = SamplingParams(temperature=0, bad_words=[bad_words_1, bad_words_2]) output = llm.generate(PROMPT, params) new_text = output[0].outputs[0].text - assert bad_words_1 not in new_text - assert bad_words_2 not in new_text + new_tokens = output[0].outputs[0].token_ids + assert not contains_bad_word(new_text, new_tokens, bad_words_1) + assert not contains_bad_word(new_text, new_tokens, bad_words_2) def test_logits_processor(llm): From 70fb77b4dcc3cfb368831d1aba8e1e2dca7c31a9 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Tue, 2 Dec 2025 00:55:02 -0800 Subject: [PATCH 18/45] [BugFix] add max-num-batched-token to scheduler hash (#29829) Signed-off-by: Boyuan Feng --- vllm/config/scheduler.py | 14 ++++++++++++-- vllm/config/vllm.py | 4 ---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 88f3e62fbd4ed..1e089b42cccde 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -175,9 +175,19 @@ class 
SchedulerConfig: excluding anything before input ids/embeddings and after the final hidden states. """ - # no factors to consider. - # this config will not affect the computation graph. factors: list[Any] = [] + + # max_num_batched_tokens need to be included in the hash due + # to two reasons: + # 1. LoRA creates static buffers based on max_num_batched_tokens. + # The tensor sizes and strides get captured in the torch.compile + # graph explicitly. + # 2. Inductor decides whether using 32-bit or 64-bit indexing integer + # based on the data sizes. `max_num_batched_tokens` has an + # impact on that. For more details, please check + # https://github.com/vllm-project/vllm/issues/29585 + factors.append(self.max_num_batched_tokens) + hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 4542866aa166c..615b1f8489eff 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -265,10 +265,6 @@ class VllmConfig: vllm_factors.append("None") if self.lora_config: vllm_factors.append(self.lora_config.compute_hash()) - # LoRA creates static buffers based on max_num_batched_tokens. - # The tensor sizes and strides get captured in the torch.compile - # graph explicitly. 
- vllm_factors.append(str(self.scheduler_config.max_num_batched_tokens)) else: vllm_factors.append("None") if self.speculative_config: From 8bbcf8b6e7ad0cdeaef010bd834bd723f4e00445 Mon Sep 17 00:00:00 2001 From: Louie Tsai Date: Tue, 2 Dec 2025 01:00:23 -0800 Subject: [PATCH 19/45] [vLLM Benchmark Suite] Add default parameters section and update CPU benchmark cases (#29381) Signed-off-by: Tsai, Louie Signed-off-by: Louie Tsai Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Li, Jiang --- .buildkite/performance-benchmarks/README.md | 59 + .../scripts/run-performance-benchmarks.sh | 48 +- .../tests/serving-tests-cpu-snc2.json | 610 ---------- .../tests/serving-tests-cpu-snc3.json | 1023 ----------------- .../tests/serving-tests-cpu.json | 516 ++++----- docs/getting_started/installation/cpu.md | 29 + 6 files changed, 374 insertions(+), 1911 deletions(-) delete mode 100644 .buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json delete mode 100644 .buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json diff --git a/.buildkite/performance-benchmarks/README.md b/.buildkite/performance-benchmarks/README.md index 6d494f64f14fa..015f48c2520d6 100644 --- a/.buildkite/performance-benchmarks/README.md +++ b/.buildkite/performance-benchmarks/README.md @@ -108,6 +108,65 @@ The number of this test is less stable compared to the delay and latency benchma WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`. +#### Default Parameters Field + +We can specify default parameters in a JSON field with key `defaults`. Parameters defined in the field are applied globally to all serving tests, and can be overridden in test case fields. Here is an example: + +
+ An Example of default parameters field + +```json +{ + "defaults": { + "qps_list": [ + "inf" + ], + "server_environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "server_parameters": { + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "block_size": 128, + "disable_log_stats": "", + "load_format": "dummy" + }, + "client_parameters": { + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "num_prompts": 200, + "ignore-eos": "" + } + }, + "tests": [ + { + "test_name": "serving_llama3B_tp2_random_128_128", + "server_parameters": { + "model": "meta-llama/Llama-3.2-3B-Instruct", + "tensor_parallel_size": 2, + }, + "client_parameters": { + "model": "meta-llama/Llama-3.2-3B-Instruct", + } + }, + { + "test_name": "serving_qwen3_tp4_random_128_128", + "server_parameters": { + "model": "Qwen/Qwen3-14B", + "tensor_parallel_size": 4, + }, + "client_parameters": { + "model": "Qwen/Qwen3-14B", + } + }, + ] +} +``` + +
+ ### Visualizing the results The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results. diff --git a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh index 99a5a5e334f8e..34ceefe0996f2 100644 --- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh @@ -110,7 +110,8 @@ json2envs() { wait_for_server() { # wait for vllm server to start # return 1 if vllm server crashes - timeout 1200 bash -c ' + local timeout_val="1200" + timeout "$timeout_val" bash -c ' until curl -X POST localhost:8000/v1/completions; do sleep 1 done' && return 0 || return 1 @@ -316,12 +317,44 @@ run_throughput_tests() { run_serving_tests() { # run serving tests using `vllm bench serve` command # $1: a json file specifying serving test cases + # + # Supported JSON formats: + # 1) Plain format: top-level array + # [ { "test_name": "...", "server_parameters": {...}, ... }, ... ] + # + # 2) Default parameters field + plain format tests + # { + # "defaults": { ... }, + # "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ] + # } local serving_test_file serving_test_file=$1 # Iterate over serving tests - jq -c '.[]' "$serving_test_file" | while read -r params; do + jq -c ' + if type == "array" then + # Plain format: test cases array + .[] + elif (type == "object" and has("tests")) then + # merge the default parameters into each test cases + . 
as $root + | ($root.defaults // {}) as $d + | ($root.tests // [])[] + # default qps / max_concurrency from defaults if missing + | .qps_list = (.qps_list // $d.qps_list) + | .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list) + # merge envs / params: test overrides defaults + | .server_environment_variables = + (($d.server_environment_variables // {}) + (.server_environment_variables // {})) + | .server_parameters = + (($d.server_parameters // {}) + (.server_parameters // {})) + | .client_parameters = + (($d.client_parameters // {}) + (.client_parameters // {})) + else + error("Unsupported serving test file format: must be array or object with .tests") + end + ' "$serving_test_file" | while read -r params; do # get the test name, and append the GPU type back to it. test_name=$(echo "$params" | jq -r '.test_name') if [[ ! "$test_name" =~ ^serving_ ]]; then @@ -335,20 +368,25 @@ run_serving_tests() { continue fi - # get client and server arguments + # get client and server arguments (after merged the default parameters) server_params=$(echo "$params" | jq -r '.server_parameters') server_envs=$(echo "$params" | jq -r '.server_environment_variables') client_params=$(echo "$params" | jq -r '.client_parameters') + server_args=$(json2args "$server_params") server_envs=$(json2envs "$server_envs") client_args=$(json2args "$client_params") + + # qps_list qps_list=$(echo "$params" | jq -r '.qps_list') qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') echo "Running over qps list $qps_list" + + # max_concurrency_list (fallback to num_prompts if missing) max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list') if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then - num_prompts=$(echo "$client_params" | jq -r '.num_prompts') - max_concurrency_list="[$num_prompts]" + num_prompts=$(echo "$client_params" | jq -r '.num_prompts') + max_concurrency_list="[$num_prompts]" fi max_concurrency_list=$(echo "$max_concurrency_list" | 
jq -r '.[] | @sh') echo "Running over max concurrency list $max_concurrency_list" diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json deleted file mode 100644 index f758097e098e4..0000000000000 --- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc2.json +++ /dev/null @@ -1,610 +0,0 @@ -[ - { - "test_name": "serving_llama8B_bf16_tp1_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_bf16_tp2_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": 
"dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_bf16_tp4_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_bf16_tp1_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": 
"random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_bf16_tp2_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_bf16_tp4_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, 
- "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int8_tp1_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int8_tp2_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int8_tp4_sharegpt", - 
"qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int8_tp1_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int8_tp2_random_128_128", - "qps_list": ["inf"], - 
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int8_tp4_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int4_tp1_sharegpt", - "qps_list": ["inf"], - 
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int4_tp2_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int4_tp4_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 
200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int4_tp1_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int4_tp2_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 
64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int4_tp4_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - } -] diff --git 
a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json deleted file mode 100644 index 0b1a42e790255..0000000000000 --- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-snc3.json +++ /dev/null @@ -1,1023 +0,0 @@ -[ - { - "test_name": "serving_llama8B_bf16_pp1_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "pipeline_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_bf16_tp2_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - 
"backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_bf16_pp3_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_bf16_tp4_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": 
"serving_llama8B_bf16_tp2pp3_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 2, - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_bf16_pp1_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "pipeline_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_bf16_tp2_random_128_128", - "qps_list": ["inf"], - 
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_bf16_pp3_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_bf16_tp4_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - 
"server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 2, - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int8_pp1_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - 
"VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "pipeline_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int8_tp2_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int8_pp3_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - 
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int8_tp4_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int8_tp2pp3_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - 
"VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 2, - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int8_pp1_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "pipeline_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int8_tp2_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - 
"VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int8_pp3_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int8_tp4_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - 
"VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int8_tp2pp3_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "tensor_parallel_size": 2, - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int4_pp1_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - 
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "pipeline_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int4_tp2_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int4_pp3_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - 
"VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int4_tp4_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int4_tp2pp3_sharegpt", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - 
"VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 2, - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_int4_pp1_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "pipeline_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int4_tp2_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - 
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int4_pp3_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int4_tp4_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - 
"VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - }, - { - "test_name": "serving_llama8B_int4_tp2pp3_random_128_128", - "qps_list": ["inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "quantization": "awq", - "tensor_parallel_size": 2, - "pipeline_parallel_size": 3, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 1000 - } - } -] diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json 
b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json index f792956f39472..8f7200862d20c 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json @@ -1,276 +1,246 @@ -[ - { - "test_name": "serving_llama8B_tp1_sharegpt", - "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [32], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 32 - } +{ + "defaults": { + "qps_list": [ + "inf" + ], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 }, - { - "test_name": "serving_llama8B_tp2_sharegpt", - "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [32], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - 
"trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 32 - } + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" }, - { - "test_name": "serving_llama8B_tp1_random_128_128", - "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [32], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 32 - } - }, - { - "test_name": "serving_llama8B_tp2_random_128_128", - "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [32], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - 
"server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 32 - } - }, - { - "test_name": "serving_llama8B_tp1_random_128_2048", - "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [32], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 2048, - "ignore-eos": "", - "num_prompts": 32 - } - }, - { - "test_name": "serving_llama8B_tp2_random_128_2048", - "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [32], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 2, - "dtype": 
"bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 2048, - "ignore-eos": "", - "num_prompts": 32 - } - }, - { - "test_name": "serving_llama8B_tp1_random_2048_128", - "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [32], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 1, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 2048, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 32 - } - }, - { - "test_name": "serving_llama8B_tp2_random_2048_128", - "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [32], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 2, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - 
"enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 2048, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 32 - } + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "ignore-eos": "", + "num_prompts": 200 } -] + }, + "tests": [ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" + } + }, + { + "test_name": "serving_llama8B_tp2_sharegpt", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_128", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_128", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp4_random_128_128", + "server_parameters": { + "tensor_parallel_size": 4 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, + { + "test_name": 
"serving_llama8B_tp2_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp4_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 4 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp1_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp2_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp4_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 4 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama3B_tp1_random_128_128", + "server_parameters": { + "model": "meta-llama/Llama-3.2-3B-Instruct", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "meta-llama/Llama-3.2-3B-Instruct", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_granite2B_tp1_random_128_128", + "server_parameters": { + "model": "ibm-granite/granite-3.2-2b-instruct", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "ibm-granite/granite-3.2-2b-instruct", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_qwen1.7B_tp1_random_128_128", + "server_parameters": { + "model": "Qwen/Qwen3-1.7B", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": 
"Qwen/Qwen3-1.7B", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_qwen4B_tp1_random_128_128", + "server_parameters": { + "model": "Qwen/Qwen3-4B", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "Qwen/Qwen3-4B", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_qwen8B_tp1_random_128_128", + "server_parameters": { + "model": "Qwen/Qwen3-8B", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "Qwen/Qwen3-8B", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_glm9B_tp1_random_128_128", + "server_parameters": { + "model": "zai-org/glm-4-9b-hf", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "zai-org/glm-4-9b-hf", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_gemma7B_tp1_random_128_128", + "server_parameters": { + "model": "google/gemma-7b", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "google/gemma-7b", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + } + ] +} diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index 18dc6d19434b3..4b68cb4811789 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -138,6 +138,35 @@ vllm serve facebook/opt-125m --dtype=bfloat16 Note, it is recommended to manually reserve 1 CPU for vLLM front-end process when `world_size == 1`. +### What are supported models on CPU? 
+ +For the full and up-to-date list of models validated on CPU platforms, please see the official documentation: [Supported Models on CPU](https://docs.vllm.ai/en/latest/models/hardware_supported_models/cpu) + +### How to find benchmark configuration examples for supported CPU models? + +For any model listed under [Supported Models on CPU](https://docs.vllm.ai/en/latest/models/hardware_supported_models/cpu), optimized runtime configurations are provided in the vLLM Benchmark Suite’s CPU test cases, defined in [cpu test cases](https://github.com/vllm-project/vllm/blob/main/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json) +For details on how these optimized configurations are determined, see: [performance-benchmark-details](https://github.com/vllm-project/vllm/tree/main/.buildkite/performance-benchmarks#performance-benchmark-details). +To benchmark the supported models using these optimized settings, follow the steps in [running vLLM Benchmark Suite manually](https://docs.vllm.ai/en/latest/contributing/benchmarks/#manually-trigger-the-benchmark) and run the Benchmark Suite on a CPU environment. + +Below is an example command to benchmark all CPU-supported models using optimized configurations. + +```bash +ON_CPU=1 bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh +``` + +The benchmark results will be saved in `./benchmark/results/`. +In the directory, the generated `.commands` files contain all example commands for the benchmark. + +We recommend configuring tensor-parallel-size to match the number of NUMA nodes on your system. Note that the current release does not support tensor-parallel-size=6. 
+To determine the number of NUMA nodes available, use the following command: + +```bash +lscpu | grep "NUMA node(s):" | awk '{print $3}' +``` + +For performance reference, users may also consult the [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm&deviceName=cpu) +, which publishes default-model CPU results produced using the same Benchmark Suite. + ### How to decide `VLLM_CPU_OMP_THREADS_BIND`? - Default `auto` thread-binding is recommended for most cases. Ideally, each OpenMP thread will be bound to a dedicated physical core respectively, threads of each rank will be bound to the same NUMA node respectively, and 1 CPU per rank will be reserved for other vLLM components when `world_size > 1`. If you have any performance problems or unexpected binding behaviours, please try to bind threads as following. From d8c6210eeaa7f3b474e50cf74926f77a8dc79adf Mon Sep 17 00:00:00 2001 From: Julien Denize <40604584+juliendenize@users.noreply.github.com> Date: Tue, 2 Dec 2025 11:29:00 +0100 Subject: [PATCH 20/45] Add Mistral Large 3 and Ministral 3 (#29757) Signed-off-by: Julien Denize Signed-off-by: Julien Denize <40604584+juliendenize@users.noreply.github.com> Signed-off-by: Mickael Seznec Signed-off-by: Roger Wang Co-authored-by: Roger Wang Co-authored-by: Mickael Seznec --- docs/models/supported_models.md | 5 +- tests/models/registry.py | 14 ++ tests/tokenizers_/test_mistral.py | 158 ++++++++++++++++- vllm/config/speculative.py | 4 + .../tool_parsers/mistral_tool_parser.py | 2 +- ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++ vllm/model_executor/layers/mla.py | 4 + .../layers/rotary_embedding/__init__.py | 2 +- vllm/model_executor/models/deepseek_v2.py | 66 ++++++- vllm/model_executor/models/mistral_large_3.py | 63 +++++++ .../models/mistral_large_3_eagle.py | 165 ++++++++++++++++++ vllm/model_executor/models/registry.py | 5 + vllm/tokenizers/mistral.py | 36 ++++ vllm/transformers_utils/configs/eagle.py | 
6 + vllm/transformers_utils/configs/mistral.py | 74 ++++++-- vllm/v1/spec_decode/eagle.py | 4 + 16 files changed, 724 insertions(+), 30 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8.json create mode 100644 vllm/model_executor/models/mistral_large_3.py create mode 100644 vllm/model_executor/models/mistral_large_3_eagle.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index da7c5edf66bfb..6ea2285b92bb8 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -417,7 +417,8 @@ th { | `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | | `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | | `MiniMaxM2ForCausalLM` | MiniMax-M2 |`MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ | -| `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | +| `MistralForCausalLM` | Ministral-3, Mistral, Mistral-Instruct | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | +| `MistralLarge3ForCausalLM` | Mistral-Large-3-675B-Base-2512, Mistral-Large-3-675B-Instruct-2512 | `mistralai/Mistral-Large-3-675B-Base-2512`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, etc. | ✅︎ | ✅︎ | | `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | | `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ | | `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. 
| ✅︎ | ✅︎ | @@ -711,7 +712,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + IE+ | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | | `Phi4MMForCausalLM` | Phi-4-multimodal | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | | `Phi4MultimodalForCausalLM` | Phi-4-multimodal (HF Transformers) | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct` (with revision `refs/pr/70`), etc. | ✅︎ | ✅︎ | -| `PixtralForConditionalGeneration` | Mistral 3 (Mistral format), Pixtral (Mistral format) | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Pixtral-12B-2409`, etc. | | ✅︎ | +| `PixtralForConditionalGeneration` | Ministral 3 (Mistral format), Mistral 3 (Mistral format), Mistral Large 3 (Mistral format), Pixtral (Mistral format) | T + I+ | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Mistral-Large-3-675B-Instruct-2512` `mistralai/Pixtral-12B-2409` etc. | | ✅︎ | | `QwenVLForConditionalGeneration`^ | Qwen-VL | T + IE+ | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | | `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A+ | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + IE+ + VE+ | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. 
| ✅︎ | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index d90f3a4d4f781..26351089fc464 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -358,6 +358,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True, ), "MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"), + "MistralLarge3ForCausalLM": _HfExamplesInfo( + "mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4", is_available_online=False + ), "MixtralForCausalLM": _HfExamplesInfo( "mistralai/Mixtral-8x7B-Instruct-v0.1", {"tiny": "TitanML/tiny-mixtral"}, @@ -770,7 +773,13 @@ _MULTIMODAL_EXAMPLE_MODELS = { ), "PixtralForConditionalGeneration": _HfExamplesInfo( "mistralai/Pixtral-12B-2409", + extras={ + "mistral-large-3": "mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4", + "ministral-3": "mistralai/Ministral-3-3B-Instruct-2512", + }, tokenizer_mode="mistral", + # TODO: revert once Mistral-Large-3 and Ministral-3 are publicly available. + is_available_online=False, ), "QwenVLForConditionalGeneration": _HfExamplesInfo( "Qwen/Qwen-VL", @@ -870,6 +879,11 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { use_original_num_layers=True, max_model_len=10240, ), + "EagleMistralLarge3ForCausalLM": _HfExamplesInfo( + "mistralai/Mistral-Large-3-675B-Instruct-2512", + speculative_model="mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle", + is_available_online=False, + ), "LlamaForCausalLMEagle3": _HfExamplesInfo( "Qwen/Qwen3-8B", trust_remote_code=True, diff --git a/tests/tokenizers_/test_mistral.py b/tests/tokenizers_/test_mistral.py index 92efac86dff29..faff611502652 100644 --- a/tests/tokenizers_/test_mistral.py +++ b/tests/tokenizers_/test_mistral.py @@ -91,6 +91,118 @@ from vllm.tokenizers.mistral import ( ], ), ), + ( + { + "messages": [ + { + "role": "user", + "content": "What is the current local date and time?", + } + ], + "tools": [ + { + "type": "function", + "function": { + "description": "Fetch the current local date and time.", + 
"unsupported_field": False, + "name": "get_current_time", + "parameters": {}, + }, + }, + { + "type": "function", + "function": { + "description": "Fetch the current local date and time.", + "unsupported_field2": False, + "name": "get_current_time", + "parameters": {}, + }, + }, + ], + }, + ( + [ + { + "role": "user", + "content": "What is the current local date and time?", + } + ], + [ + { + "type": "function", + "function": { + "description": "Fetch the current local date and time.", + "name": "get_current_time", + "parameters": {}, + }, + }, + { + "type": "function", + "function": { + "description": "Fetch the current local date and time.", + "name": "get_current_time", + "parameters": {}, + }, + }, + ], + ), + ), + ( + { + "messages": [ + { + "role": "user", + "content": "What is the current local date and time?", + } + ], + "tools": [ + { + "type": "function", + "unsupported_field": False, + "function": { + "description": "Fetch the current local date and time.", + "name": "get_current_time", + "parameters": {}, + }, + }, + { + "type": "function", + "unsupported_field2": False, + "function": { + "description": "Fetch the current local date and time 2.", + "name": "get_current_time2", + "parameters": {"a": "1"}, + }, + }, + ], + }, + ( + [ + { + "role": "user", + "content": "What is the current local date and time?", + } + ], + [ + { + "type": "function", + "function": { + "description": "Fetch the current local date and time.", + "name": "get_current_time", + "parameters": {}, + }, + }, + { + "type": "function", + "function": { + "description": "Fetch the current local date and time 2.", + "name": "get_current_time2", + "parameters": {"a": "1"}, + }, + }, + ], + ), + ), ], ) def test_prepare_apply_chat_template_tools_and_messages( @@ -1108,13 +1220,6 @@ class TestMistralTokenizer: ) == expected_tokens[mistral_tokenizer.is_tekken] ) - assert ( - mistral_tokenizer.decode( - ids[mistral_tokenizer.is_tekken], - skip_special_tokens=skip_special_tokens, - ) - == 
expected_tokens[mistral_tokenizer.is_tekken] - ) def test_decode_empty( self, @@ -1140,6 +1245,45 @@ class TestMistralTokenizer: == "" ) + @pytest.mark.parametrize( + "skip_special_tokens,expected_tokens", + ( + ( + False, + ( + ["[INST]▁Hello▁world▁![/INST]▁Hello"], + ["[INST]Hello world ![/INST]Hello"], + ), + ), + (True, (["Hello world ! Hello"], ["Hello world !Hello"])), + ), + ) + def test_batch_decode( + self, + mistral_tokenizer: MistralTokenizer, + skip_special_tokens: bool, + expected_tokens: tuple[str, str], + ): + ids = ( + [[1, 3, 23325, 2294, 1686, 4, 23325, 2]], + [[1, 3, 22177, 4304, 2662, 4, 22177, 2]], + ) + assert ( + mistral_tokenizer.batch_decode( + ids[mistral_tokenizer.is_tekken], + skip_special_tokens=skip_special_tokens, + ) + == expected_tokens[mistral_tokenizer.is_tekken] + ) + + def test_batch_decode_empty( + self, + mistral_tokenizer: MistralTokenizer, + ): + assert mistral_tokenizer.batch_decode( + [[]], + ) == [""] + def test_convert_tokens_to_string(self, mistral_tokenizer: MistralTokenizer): tokens = ( [ diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index 80d53a543f149..c6d6f705f535c 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -167,6 +167,7 @@ class SpeculativeConfig: @staticmethod def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig: + initial_architecture = hf_config.architectures[0] if hf_config.model_type in ("deepseek_v3", "deepseek_v32"): hf_config.model_type = "deepseek_mtp" if hf_config.model_type == "deepseek_mtp": @@ -226,6 +227,9 @@ class SpeculativeConfig: {"n_predict": n_predict, "architectures": ["LongCatFlashMTPModel"]} ) + if initial_architecture == "MistralLarge3ForCausalLM": + hf_config.update({"architectures": ["EagleMistralLarge3ForCausalLM"]}) + return hf_config def __post_init__(self): diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index 
7e2d67a1fb659..89b882d6c8475 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -80,7 +80,7 @@ class MistralToolParser(ToolParser): self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL) if _is_fn_name_regex_support(self.model_tokenizer): self.fn_name_regex = re.compile( - r"([a-zA-Z0-9_-]+)(\{[\s\S]*?\})(?=\s*$|,|\s)", re.DOTALL + r"([a-zA-Z0-9_-]+)(\{[\s\S]*?\})(?=\s*$|,|\s)?", re.DOTALL ) else: self.fn_name_regex = None diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..a9f24c20a25a2 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + 
"GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/mla.py b/vllm/model_executor/layers/mla.py index 6ebfa47a9dc3f..dad960160f2ad 100644 --- a/vllm/model_executor/layers/mla.py +++ b/vllm/model_executor/layers/mla.py @@ -111,6 +111,7 @@ class MultiHeadLatentAttentionWrapper(CustomOp): self, positions: torch.Tensor, hidden_states: torch.Tensor, + llama_4_scaling: torch.Tensor | None = None, ) -> torch.Tensor: q_c = None kv_lora = None @@ -159,6 +160,9 @@ class MultiHeadLatentAttentionWrapper(CustomOp): hidden_states, q_c, 
positions, self.indexer_rope_emb ) + if llama_4_scaling is not None: + q *= llama_4_scaling + attn_out = self.mla_attn( q, kv_c_normed, diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py index 0f10bff6ac4f5..aa6ece30026d3 100644 --- a/vllm/model_executor/layers/rotary_embedding/__init__.py +++ b/vllm/model_executor/layers/rotary_embedding/__init__.py @@ -238,7 +238,7 @@ def get_rope( dtype, **extra_kwargs, ) - elif scaling_type == "deepseek_yarn": + elif scaling_type in ["deepseek_yarn", "deepseek_llama_scaling"]: scaling_factor = rope_parameters["factor"] original_max_position = rope_parameters["original_max_position_embeddings"] # assert max_position == original_max_position * scaling_factor diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 73cac2556c55a..d8a081af125c5 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -395,6 +395,16 @@ def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float: return 0.1 * mscale * math.log(scale) + 1.0 +def _get_llama_4_scaling( + original_max_position_embeddings: int, scaling_beta: float, positions: torch.Tensor +) -> torch.Tensor: + scaling = 1 + scaling_beta * torch.log( + 1 + torch.floor(positions / original_max_position_embeddings) + ) + # Broadcast over num_heads and head_dim + return scaling[..., None, None] + + class DeepseekV2Attention(nn.Module): def __init__( self, @@ -481,7 +491,11 @@ class DeepseekV2Attention(nn.Module): prefix=f"{prefix}.o_proj", ) if config.rope_parameters["rope_type"] != "default": - config.rope_parameters["rope_type"] = "deepseek_yarn" + config.rope_parameters["rope_type"] = ( + "deepseek_yarn" + if config.rope_parameters.get("apply_yarn_scaling", True) + else "deepseek_llama_scaling" + ) self.rotary_emb = get_rope( qk_rope_head_dim, @@ -491,7 +505,10 @@ class DeepseekV2Attention(nn.Module): 
is_neox_style=False, ) - if config.rope_parameters["rope_type"] != "default": + if ( + config.rope_parameters["rope_type"] != "default" + and config.rope_parameters["rope_type"] == "deepseek_yarn" + ): mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False) scaling_factor = config.rope_parameters["factor"] mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) @@ -511,6 +528,7 @@ class DeepseekV2Attention(nn.Module): self, positions: torch.Tensor, hidden_states: torch.Tensor, + llama_4_scaling: torch.Tensor | None, ) -> torch.Tensor: if self.q_lora_rank is not None: q = self.q_a_proj(hidden_states)[0] @@ -536,6 +554,11 @@ class DeepseekV2Attention(nn.Module): k = torch.empty_like(q) k[..., : self.qk_nope_head_dim] = k_nope k[..., self.qk_nope_head_dim :] = k_pe + + # Apply llama 4 scaling if provided + if llama_4_scaling is not None: + q *= llama_4_scaling + # padding value to qk_head_dim for alignment v = torch.nn.functional.pad( v, [0, self.qk_head_dim - self.v_head_dim], value=0 @@ -987,7 +1010,12 @@ class DeepseekV2MLAAttention(nn.Module): ) if config.rope_parameters["rope_type"] != "default": - config.rope_parameters["rope_type"] = "deepseek_yarn" + config.rope_parameters["rope_type"] = ( + "deepseek_yarn" + if config.rope_parameters.get("apply_yarn_scaling", True) + else "deepseek_llama_scaling" + ) + self.rotary_emb = get_rope( qk_rope_head_dim, rotary_dim=qk_rope_head_dim, @@ -995,7 +1023,11 @@ class DeepseekV2MLAAttention(nn.Module): rope_parameters=config.rope_parameters, is_neox_style=False, ) - if config.rope_parameters["rope_type"] != "default": + + if ( + config.rope_parameters["rope_type"] != "default" + and config.rope_parameters["rope_type"] == "deepseek_yarn" + ): mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False) scaling_factor = config.rope_parameters["factor"] mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) @@ -1064,8 +1096,9 @@ class DeepseekV2MLAAttention(nn.Module): self, positions: 
torch.Tensor, hidden_states: torch.Tensor, + llama_4_scaling: torch.Tensor | None, ) -> torch.Tensor: - return self.mla_attn(positions, hidden_states) + return self.mla_attn(positions, hidden_states, llama_4_scaling) class DeepseekV2DecoderLayer(nn.Module): @@ -1155,6 +1188,7 @@ class DeepseekV2DecoderLayer(nn.Module): positions: torch.Tensor, hidden_states: torch.Tensor, residual: torch.Tensor | None, + llama_4_scaling: torch.Tensor | None = None, ) -> torch.Tensor: # Self Attention if residual is None: @@ -1165,6 +1199,7 @@ class DeepseekV2DecoderLayer(nn.Module): hidden_states = self.self_attn( positions=positions, hidden_states=hidden_states, + llama_4_scaling=llama_4_scaling, ) if ( @@ -1266,8 +1301,24 @@ class DeepseekV2Model(nn.Module): hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] + # Compute llama 4 scaling once per forward pass if enabled + llama_4_scaling_config = getattr(self.config, "llama_4_scaling", None) + llama_4_scaling: torch.Tensor | None + if llama_4_scaling_config is not None: + llama_4_scaling = _get_llama_4_scaling( + original_max_position_embeddings=llama_4_scaling_config[ + "original_max_position_embeddings" + ], + scaling_beta=llama_4_scaling_config["beta"], + positions=positions, + ) + else: + llama_4_scaling = None + for layer in islice(self.layers, self.start_layer, self.end_layer): - hidden_states, residual = layer(positions, hidden_states, residual) + hidden_states, residual = layer( + positions, hidden_states, residual, llama_4_scaling + ) if not get_pp_group().is_last_rank: return IntermediateTensors( @@ -1325,6 +1376,7 @@ class DeepseekV2ForCausalLM( packed_modules_mapping = { "gate_up_proj": ["gate_proj", "up_proj"], } + model_cls = DeepseekV2Model def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -1355,7 +1407,7 @@ class DeepseekV2ForCausalLM( "kv_a_proj_with_mqa", ] - self.model = DeepseekV2Model( + self.model = self.model_cls( 
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") ) if get_pp_group().is_last_rank: diff --git a/vllm/model_executor/models/mistral_large_3.py b/vllm/model_executor/models/mistral_large_3.py new file mode 100644 index 0000000000000..ff7e9b60c1d3c --- /dev/null +++ b/vllm/model_executor/models/mistral_large_3.py @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterable + +import regex as re +import torch + +from vllm.model_executor.models.deepseek_v2 import DeepseekV3ForCausalLM + + +class MistralLarge3ForCausalLM(DeepseekV3ForCausalLM): + # fmt: off + remapping = { + r"layers\.(\d+)\.attention_norm\.weight": r"model.layers.\1.input_layernorm.weight", # noqa: E501 + r"layers\.(\d+)\.attention\.wq_a\.(\w+)": r"model.layers.\1.self_attn.q_a_proj.\2", # noqa: E501 + r"layers\.(\d+)\.attention\.q_a_norm\.weight": r"model.layers.\1.self_attn.q_a_layernorm.weight", # noqa: E501 + r"layers\.(\d+)\.attention\.wq_b\.(\w+)": r"model.layers.\1.self_attn.q_b_proj.\2", # noqa: E501 + r"layers\.(\d+)\.attention\.wkv_a_with_mqa\.(\w+)": r"model.layers.\1.self_attn.kv_a_proj_with_mqa.\2", # noqa: E501 + r"layers\.(\d+)\.attention\.kv_a_norm\.weight": r"model.layers.\1.self_attn.kv_a_layernorm.weight", # noqa: E501 + r"layers\.(\d+)\.attention\.wkv_b\.(\w+)": r"model.layers.\1.self_attn.kv_b_proj.\2", # noqa: E501 + r"layers\.(\d+)\.attention\.wo\.(\w+)": r"model.layers.\1.self_attn.o_proj.\2", # noqa: E501 + r"layers\.(\d+)\.ffn_norm\.weight": r"model.layers.\1.post_attention_layernorm.weight", # noqa: E501 + r"layers\.(\d+)\.feed_forward\.w1\.(\w+)": r"model.layers.\1.mlp.gate_proj.\2", # noqa: E501 + r"layers\.(\d+)\.feed_forward\.w2\.(\w+)": r"model.layers.\1.mlp.down_proj.\2", # noqa: E501 + r"layers\.(\d+)\.feed_forward\.w3\.(\w+)": r"model.layers.\1.mlp.up_proj.\2", # noqa: E501 + r"layers\.(\d+)\.gate\.weight": r"model.layers.\1.mlp.gate.weight", # noqa: E501 
+ r"layers\.(\d+)\.shared_experts\.w1\.(\w+)": r"model.layers.\1.mlp.shared_experts.gate_proj.\2", # noqa: E501 + r"layers\.(\d+)\.shared_experts\.w2\.(\w+)": r"model.layers.\1.mlp.shared_experts.down_proj.\2", # noqa: E501 + r"layers\.(\d+)\.shared_experts\.w3\.(\w+)": r"model.layers.\1.mlp.shared_experts.up_proj.\2", # noqa: E501 + r"layers\.(\d+)\.experts\.(\d+)\.w1\.(\w+)": r"model.layers.\1.mlp.experts.\2.gate_proj.\3", # noqa: E501 + r"layers\.(\d+)\.experts\.(\d+)\.w2\.(\w+)": r"model.layers.\1.mlp.experts.\2.down_proj.\3", # noqa: E501 + r"layers\.(\d+)\.experts\.(\d+)\.w3\.(\w+)": r"model.layers.\1.mlp.experts.\2.up_proj.\3", # noqa: E501 + r"norm\.weight": "model.norm.weight", # noqa: E501 + r"tok_embeddings\.weight": "model.embed_tokens.weight", # noqa: E501 + r"output\.weight": "lm_head.weight", # noqa: E501 + } + # fmt: on + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + return super().load_weights(map(self._remap_mistral_to_ds, weights)) + + def _remap_mistral_to_ds( + self, weight: tuple[str, torch.Tensor] + ) -> tuple[str, torch.Tensor]: + """Remap Mistral parameters to DeepseekV2 parameters.""" + name, loaded_weight = weight + + for k, v in self.remapping.items(): + match = re.fullmatch(k, name) + if match: + name = re.sub(k, v, name) + break + else: + raise ValueError(f"Cannot remap {name}") + + # Remapping scale names. We could do this in the regex above but it + # would triple the number of lines for most layers. 
+ if name.endswith(".qscale_act"): + name = re.sub(r"\.qscale_act$", ".input_scale", name) + elif name.endswith(".qscale_weight"): + name = re.sub(r"\.qscale_weight$", ".weight_scale", name) + + return name, loaded_weight diff --git a/vllm/model_executor/models/mistral_large_3_eagle.py b/vllm/model_executor/models/mistral_large_3_eagle.py new file mode 100644 index 0000000000000..e3ca9e4ca82d0 --- /dev/null +++ b/vllm/model_executor/models/mistral_large_3_eagle.py @@ -0,0 +1,165 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Iterable +from functools import partial + +import torch +import torch.nn as nn + +from vllm.compilation.decorators import support_torch_compile +from vllm.config import VllmConfig +from vllm.distributed.parallel_state import get_pp_group +from vllm.logger import init_logger +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import RowParallelLinear +from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding +from vllm.model_executor.models.deepseek_v2 import ( + DeepseekV2DecoderLayer, + DeepseekV2Model, +) +from vllm.model_executor.models.interfaces import MultiModalEmbeddings +from vllm.model_executor.models.mistral_large_3 import MistralLarge3ForCausalLM +from vllm.multimodal.inputs import NestedTensors + +from .utils import ( + _merge_multimodal_embeddings, + make_empty_intermediate_tensors_factory, + maybe_prefix, +) + +logger = init_logger(__name__) + + +@support_torch_compile +class EagleMistralLarge3Model(DeepseekV2Model): + def __init__( + self, *, vllm_config: VllmConfig, prefix: str = "", start_layer_id: int = 0 + ): + nn.Module.__init__(self) + + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.vllm_config = vllm_config + + self.vocab_size = config.vocab_size + + assert get_pp_group().world_size == 
1 + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.embed_tokens", + ) + + self.layers = nn.ModuleList( + [ + DeepseekV2DecoderLayer( + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"), + ) + for i in range(self.config.num_hidden_layers) + ] + ) + self.start_layer = 0 + self.end_layer = self.config.num_hidden_layers + + self.fc = RowParallelLinear( + self.config.hidden_size * 2, + self.config.hidden_size, + bias=False, + input_is_parallel=False, + quant_config=quant_config, + return_bias=False, + ) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size + ) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor: + if inputs_embeds is None: + inputs_embeds = self.embed_input_ids(input_ids) + inputs_embeds = self.fc(torch.cat((inputs_embeds, hidden_states), dim=-1)) + output = super().forward( + input_ids, positions, intermediate_tensors=None, inputs_embeds=inputs_embeds + ) + assert isinstance(output, torch.Tensor) + return output + + +class EagleMistralLarge3ForCausalLM(MistralLarge3ForCausalLM): + remapping = MistralLarge3ForCausalLM.remapping | { + r"eagle_linear\.weight": r"model.fc.weight", + r"eagle_linear\.qscale_act": r"model.fc.input_scale", + r"eagle_linear\.qscale_weight": r"model.fc.weight_scale", + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + target_layer_num = vllm_config.model_config.get_num_layers( + vllm_config.parallel_config + ) + vllm_config.model_config = vllm_config.speculative_config.draft_model_config + # draft model quantization config may differ from target model + self.quant_config = VllmConfig.get_quantization_config( + 
vllm_config.speculative_config.draft_model_config, vllm_config.load_config + ) + vllm_config.quant_config = self.quant_config + self.model_cls = partial( + EagleMistralLarge3Model, start_layer_id=target_layer_num + ) + super().__init__(vllm_config=vllm_config, prefix=prefix) + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: MultiModalEmbeddings | None = None, + *, + is_multimodal: torch.Tensor | None = None, + handle_oov_mm_token: bool = False, + ) -> torch.Tensor: + inputs_embeds = super().embed_input_ids(input_ids) + + if multimodal_embeddings is None or len(multimodal_embeddings) == 0: + return inputs_embeds + + assert is_multimodal is not None + + return _merge_multimodal_embeddings( + inputs_embeds=inputs_embeds, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, + ) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + inputs_embeds: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + hidden_states = self.model(input_ids, positions, hidden_states, inputs_embeds) + return hidden_states, hidden_states + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + # Pretend we've loaded the embedding and lm_head weights + # (later copied from target model) + return super().load_weights(weights) | { + "model.embed_tokens.weight", + "lm_head.weight", + } + + def embed_input_ids( + self, + input_ids: torch.Tensor, + multimodal_embeddings: NestedTensors | None = None, + is_multimodal: torch.Tensor | None = None, + ) -> torch.Tensor: + return self.model.embed_input_ids(input_ids) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 73a61f1148b50..d3b6268e7647b 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -145,6 +145,7 @@ _TEXT_GENERATION_MODELS = { "MiniMaxM1ForCausalLM": ("minimax_text_01", 
"MiniMaxText01ForCausalLM"), "MiniMaxM2ForCausalLM": ("minimax_m2", "MiniMaxM2ForCausalLM"), "MistralForCausalLM": ("llama", "LlamaForCausalLM"), + "MistralLarge3ForCausalLM": ("mistral_large_3", "MistralLarge3ForCausalLM"), "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), # transformers's mpt class has lower case "MptForCausalLM": ("mpt", "MPTForCausalLM"), @@ -424,6 +425,10 @@ _SPECULATIVE_DECODING_MODELS = { "LlamaForCausalLMEagle3": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "Eagle3Qwen2_5vlForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "Eagle3Qwen3vlForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), + "EagleMistralLarge3ForCausalLM": ( + "mistral_large_3_eagle", + "EagleMistralLarge3ForCausalLM", + ), "EagleDeepSeekMTPModel": ("deepseek_eagle", "EagleDeepseekV3ForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), "ErnieMTPModel": ("ernie_mtp", "ErnieMTP"), diff --git a/vllm/tokenizers/mistral.py b/vllm/tokenizers/mistral.py index 7e6745004b01f..96d1e78ce9f17 100644 --- a/vllm/tokenizers/mistral.py +++ b/vllm/tokenizers/mistral.py @@ -97,6 +97,8 @@ def _prepare_apply_chat_template_tools_and_messages( continue_final_message: bool = False, add_generation_prompt: bool = False, ) -> tuple[list["ChatCompletionMessageParam"], list[dict[str, Any]] | None]: + from mistral_common.protocol.instruct.tool_calls import Function, Tool + if add_generation_prompt and continue_final_message: raise ValueError( "Cannot set both `add_generation_prompt` and " @@ -139,6 +141,33 @@ def _prepare_apply_chat_template_tools_and_messages( if function.get("description") is None: function["description"] = "" + # We filter not supported arguments to avoid throwing an error. + # TODO(juliendenize): remove this once OpenAI API is better supported by + # `mistral-common`. 
+        tools_fields = set(Tool.model_fields.keys())
+        function_fields = set(Function.model_fields.keys())
+        for tool in tools:
+            tool_keys = list(tool.keys())
+            for tool_key in tool_keys:
+                if tool_key not in tools_fields:
+                    tool.pop(tool_key)
+                    logger.warning_once(
+                        f"'{tool_key}' is not supported by mistral-common for tools. "
+                        "It has been popped from the tool definition."
+                    )
+            if tool["type"] == "function":
+                function_keys = list(tool["function"].keys())
+                for function_key in function_keys:
+                    if function_key not in function_fields:
+                        tool["function"].pop(function_key)
+                        logger.warning_once(
+                            f"'{function_key}' is not supported by mistral-common "
+                            "for function tools. It has been popped from the "
+                            "function definition."
+                        )
+            else:
+                raise ValueError("mistral-common only supports function tools.")
+
     return messages, tools
 
 
@@ -410,6 +439,13 @@ class MistralTokenizer(TokenizerLike):
             ids, skip_special_tokens=skip_special_tokens
         )
 
+    def batch_decode(
+        self, ids: list[list[int]] | list[int], skip_special_tokens: bool = False
+    ) -> str:
+        return self.transformers_tokenizer.batch_decode(
+            ids, skip_special_tokens=skip_special_tokens
+        )
+
     def convert_tokens_to_string(self, tokens: list[str]) -> str:
         from mistral_common.tokens.tokenizers.base import (
             SpecialTokenPolicy,
diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py
index f5dc9ddfbc575..ce428e567c844 100644
--- a/vllm/transformers_utils/configs/eagle.py
+++ b/vllm/transformers_utils/configs/eagle.py
@@ -82,3 +82,9 @@ class EAGLEConfig(PretrainedConfig):
             pretrained_model_name_or_path, **kwargs
         )
         return cls.from_dict(config_dict, **kwargs)
+
+    def to_json_string(self, use_diff: bool = True) -> str:
+        # we override use_diff to False as initializing
+        # EAGLEConfig with default arguments is not supported
+        del use_diff
+        return super().to_json_string(use_diff=False)
diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py
index 966737aad0867..d59169d95f0c9 100644 --- a/vllm/transformers_utils/configs/mistral.py +++ b/vllm/transformers_utils/configs/mistral.py @@ -18,9 +18,31 @@ def adapt_config_dict( if bool(config_dict.get("quantization")): config_dict = _remap_mistral_quantization_args(config_dict) + is_moe = bool(config_dict.get("moe")) + is_mistral_large_3 = ( + is_moe and (config_dict["moe"].get("num_shared_experts") or 0) > 0 + ) if config_dict.get("model_type") == "mamba": config_dict["architectures"] = ["Mamba2ForCausalLM"] - elif bool(config_dict.get("moe")): + elif is_moe and is_mistral_large_3: + config_dict = _remap_moe_args(config_dict) + config_dict["model_type"] = "deepseek_v3" + config_dict["architectures"] = ["MistralLarge3ForCausalLM"] + + assert "llama_4_scaling" in config_dict, ( + "MistralLarge3 expect llama4 scaling config." + ) + llama_4_scaling_config_keys = ["original_max_position_embeddings", "beta"] + assert all( + [ + key in config_dict["llama_4_scaling"] + for key in llama_4_scaling_config_keys + ] + ), ( + "llama_4_scaling config should define the keys: " + f"{','.join(llama_4_scaling_config_keys)}" + ) + elif is_moe: config_dict["architectures"] = ["MixtralForCausalLM"] else: config_dict["architectures"] = ["MistralForCausalLM"] @@ -140,17 +162,20 @@ def _remap_general_mistral_args(config: dict) -> dict: def _remap_mistral_quantization_args(config: dict) -> dict: - quantization = config.get("quantization", {}) - if quantization.get("qformat_weight") == "fp8_e4m3": - # This maps to the FP8 static per-tensor quantization scheme - quantization_config = {"quant_method": "fp8", "activation_scheme": "static"} - elif quantization.get("quant_method") == "compressed-tensors": - # Pass through the quantization config to compressed-tensors - quantization_config = quantization - else: - raise ValueError(f"Found unknown quantization='{quantization}' in config") - - config["quantization_config"] = quantization_config + if config.get("quantization"): + quantization = 
config.pop("quantization", {}) + if quantization.get("qformat_weight") == "fp8_e4m3": + qscheme_act = quantization.get("qscheme_act") + assert qscheme_act in ("NO_SCALES", "TENSOR", None), ( + "Only NO_SCALES and TENSOR (default) are supported for qscheme_act" + ) + is_dynamic = qscheme_act == "NO_SCALES" + config["quantization_config"] = { + "quant_method": "fp8", + "activation_scheme": "dynamic" if is_dynamic else "static", + } + else: + raise ValueError(f"Found unknown quantization='{quantization}' in config") return config @@ -183,3 +208,28 @@ def _remap_mistral_audio_args(config: dict) -> dict: if quant_config: config["quantization_config"] = quant_config return config + + +def _remap_moe_args(config: dict) -> dict: + moe_config_map = { + "route_every_n": "moe_layer_freq", + "first_k_dense_replace": "first_k_dense_replace", + "num_experts_per_tok": "num_experts_per_tok", + "num_experts": "n_routed_experts", + "expert_hidden_dim": "moe_intermediate_size", + "routed_scale": "routed_scaling_factor", + "num_shared_experts": "n_shared_experts", + "num_expert_groups": "n_group", + "num_expert_groups_per_tok": "topk_group", + } + moe_config = config.get("moe", {}) + for old_name, new_name in moe_config_map.items(): + if old_name in moe_config: + value = moe_config.pop(old_name) + config[new_name] = value + + config["topk_method"] = None + config["norm_topk_prob"] = True + config["scoring_func"] = "softmax" + + return config diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index d7111d52dd8a1..1c7845a14b742 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -1016,6 +1016,10 @@ class EagleProposer: "Qwen3VLForConditionalGeneration", ]: self.model.config.image_token_index = target_model.config.image_token_id + elif self.get_model_name(target_model) == "PixtralForConditionalGeneration": + self.model.config.image_token_index = ( + target_model.config.vision_config.image_token_id + ) else: 
self.model.config.image_token_index = ( target_model.config.image_token_index From 951445a52df030050c9a3ed72d612d7e807ba368 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Dec 2025 12:16:37 +0000 Subject: [PATCH 21/45] Remove default values from `InitVar`s so that they're not stored (#29859) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- benchmarks/benchmark_ngram_proposer.py | 5 ++- tests/compile/test_fusion_attn.py | 15 ++++--- tests/lora/test_worker.py | 25 +++++++---- tests/test_config.py | 13 ++++++ tests/v1/attention/utils.py | 2 + tests/v1/core/test_kv_cache_utils.py | 11 ++++- tests/v1/core/test_scheduler.py | 13 +++--- tests/v1/core/utils.py | 15 +++---- tests/v1/cudagraph/test_cudagraph_dispatch.py | 4 +- tests/v1/engine/test_engine_core.py | 13 +++--- tests/v1/kv_connector/unit/utils.py | 13 +++--- tests/v1/spec_decode/test_eagle.py | 5 ++- tests/v1/spec_decode/test_mtp.py | 5 ++- tests/v1/tpu/worker/test_tpu_model_runner.py | 11 ++--- tests/v1/worker/test_gpu_model_runner.py | 20 +++++---- vllm/config/scheduler.py | 42 +++++++++++-------- vllm/config/vllm.py | 4 +- 17 files changed, 139 insertions(+), 77 deletions(-) diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py index dedb564fffac8..cac401456b62a 100644 --- a/benchmarks/benchmark_ngram_proposer.py +++ b/benchmarks/benchmark_ngram_proposer.py @@ -108,7 +108,10 @@ def benchmark_batched_propose(args): device_config=DeviceConfig(device=current_platform.device_type), parallel_config=ParallelConfig(), load_config=LoadConfig(), - scheduler_config=SchedulerConfig(), + scheduler_config=SchedulerConfig( + max_model_len=model_config.max_model_len, + is_encoder_decoder=model_config.is_encoder_decoder, + ), ) # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index dbe12dc5de705..4d213e030edb5 
100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -318,13 +318,18 @@ def test_attention_quant_pattern( torch.set_default_dtype(dtype) torch.manual_seed(42) + model_config = ModelConfig( + model=model_name, + max_model_len=2048, + dtype=dtype, + ) vllm_config = VllmConfig( - model_config=ModelConfig( - model=model_name, - max_model_len=2048, - dtype=dtype, + model_config=model_config, + scheduler_config=SchedulerConfig( + max_num_seqs=1024, + max_model_len=model_config.max_model_len, + is_encoder_decoder=model_config.is_encoder_decoder, ), - scheduler_config=SchedulerConfig(max_num_seqs=1024), compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, custom_ops=custom_ops_list, diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index b163559a9414d..54059ec561907 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -33,14 +33,16 @@ def test_worker_apply_lora(qwen3_lora_files): lora_requests, lora_mapping ) + model_config = ModelConfig( + MODEL_PATH, + seed=0, + dtype="float16", + max_model_len=127, + enforce_eager=True, + ) + vllm_config = VllmConfig( - model_config=ModelConfig( - MODEL_PATH, - seed=0, - dtype="float16", - max_model_len=127, - enforce_eager=True, - ), + model_config=model_config, load_config=LoadConfig( download_dir=None, load_format="dummy", @@ -50,7 +52,14 @@ def test_worker_apply_lora(qwen3_lora_files): tensor_parallel_size=1, data_parallel_size=1, ), - scheduler_config=SchedulerConfig("generate", 32, 32, 32), + scheduler_config=SchedulerConfig( + max_model_len=model_config.max_model_len, + is_encoder_decoder=model_config.is_encoder_decoder, + runner_type="generate", + max_num_batched_tokens=32, + max_num_seqs=32, + max_num_partial_prefills=32, + ), device_config=DeviceConfig("cuda"), cache_config=CacheConfig( block_size=16, diff --git a/tests/test_config.py b/tests/test_config.py index 76e0d94425fa6..b7ed68fea92ab 100644 --- a/tests/test_config.py +++ 
b/tests/test_config.py @@ -6,12 +6,14 @@ from dataclasses import MISSING, Field, asdict, dataclass, field from unittest.mock import patch import pytest +from pydantic import ValidationError from vllm.compilation.backends import VllmBackend from vllm.config import ( CompilationConfig, ModelConfig, PoolerConfig, + SchedulerConfig, VllmConfig, update_config, ) @@ -1095,3 +1097,14 @@ def test_vllm_config_explicit_overrides(): # Other fields should still use defaults assert config.compilation_config.mode == CompilationMode.VLLM_COMPILE assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE + + +def test_scheduler_config_init(): + with pytest.raises(ValidationError): + # Positional InitVars missing + # (InitVars cannot have defaults otherwise they will become attributes) + SchedulerConfig() + + with pytest.raises(AttributeError): + # InitVar does not become an attribute + print(SchedulerConfig.default_factory().max_model_len) diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index df3d53332c7cd..6cab129c116c5 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -185,6 +185,8 @@ def create_vllm_config( max_num_seqs=max_num_seqs, max_num_batched_tokens=max_num_batched_tokens, enable_chunked_prefill=enable_chunked_prefill, + max_model_len=model_config.max_model_len, + is_encoder_decoder=model_config.is_encoder_decoder, ) device_config = DeviceConfig() diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 58a7a2692bfc8..fd5cf6d3e74aa 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -1128,7 +1128,11 @@ def test_estimate_max_model_len(model_id, max_model_len, want_estimated_max_len) dtype="float16", max_model_len=max_model_len, ) - scheduler_config = SchedulerConfig(max_num_batched_tokens=32768) + scheduler_config = SchedulerConfig( + max_num_batched_tokens=32768, + max_model_len=model_config.max_model_len, + 
is_encoder_decoder=model_config.is_encoder_decoder, + ) vllm_config = VllmConfig( model_config=model_config, @@ -1163,7 +1167,10 @@ def test_get_max_concurrency_for_kv_cache_config(): max_model_len=max_model_len, ) scheduler_config = SchedulerConfig( - max_num_batched_tokens=1024, enable_chunked_prefill=True + max_num_batched_tokens=1024, + enable_chunked_prefill=True, + max_model_len=model_config.max_model_len, + is_encoder_decoder=model_config.is_encoder_decoder, ) vllm_config = VllmConfig( diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 0051c11d18d85..c6c4a5085bff7 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1508,6 +1508,12 @@ def create_scheduler_with_priority( Returns: {class}`Scheduler` instance with priority scheduling """ + model_config = ModelConfig( + model=model, + trust_remote_code=True, + dtype="float16", + seed=42, + ) if max_model_len is None: max_model_len = max_num_batched_tokens scheduler_config = SchedulerConfig( @@ -1517,14 +1523,9 @@ def create_scheduler_with_priority( long_prefill_token_threshold=long_prefill_token_threshold, disable_chunked_mm_input=disable_chunked_mm_input, enable_chunked_prefill=True, + is_encoder_decoder=model_config.is_encoder_decoder, policy="priority", # Enable priority scheduling ) - model_config = ModelConfig( - model=model, - trust_remote_code=True, - dtype="float16", - seed=42, - ) # Cache config, optionally force APC cache_config = CacheConfig( block_size=block_size, diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index 7537c7a60476b..f5ba613d38db1 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -69,6 +69,13 @@ def create_scheduler( Returns: {class}`Scheduler` instance """ + model_config = ModelConfig( + model=model, + trust_remote_code=True, + dtype="float16", + seed=42, + skip_tokenizer_init=skip_tokenizer_init, + ) if max_model_len is None: max_model_len = max_num_batched_tokens scheduler_config = 
SchedulerConfig( @@ -79,13 +86,7 @@ def create_scheduler( disable_chunked_mm_input=disable_chunked_mm_input, enable_chunked_prefill=enable_chunked_prefill, async_scheduling=async_scheduling, - ) - model_config = ModelConfig( - model=model, - trust_remote_code=True, - dtype="float16", - seed=42, - skip_tokenizer_init=skip_tokenizer_init, + is_encoder_decoder=model_config.is_encoder_decoder, ) # Cache config, optionally force APC cache_config = CacheConfig( diff --git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py index 314e7094ef97f..b86534d3d4381 100644 --- a/tests/v1/cudagraph/test_cudagraph_dispatch.py +++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py @@ -40,7 +40,9 @@ def _create_vllm_config( ) -> MagicMock: mock_config = MagicMock(spec=VllmConfig) mock_config.compilation_config = compilation_config - mock_config.scheduler_config = SchedulerConfig(max_num_seqs=max_num_seqs) + mock_config.scheduler_config = SchedulerConfig.default_factory( + max_num_seqs=max_num_seqs, + ) mock_config.parallel_config = ParallelConfig() mock_config.speculative_config = None # No speculative decoding if not lora_config: diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 3ba8ab26f5522..48be8c15aba9e 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -484,12 +484,6 @@ def test_encoder_instance_zero_kv_cache( vision encoder, so they don't need KV cache for text generation. 
""" # Form vllm config - scheduler_config = SchedulerConfig( - max_num_seqs=10, - max_num_batched_tokens=512, - max_model_len=512, - disable_hybrid_kv_cache_manager=True, - ) model_config = ModelConfig( model="llava-hf/llava-1.5-7b-hf", # Multimodal model enforce_eager=True, @@ -497,6 +491,13 @@ def test_encoder_instance_zero_kv_cache( dtype="float16", seed=42, ) + scheduler_config = SchedulerConfig( + max_num_seqs=10, + max_num_batched_tokens=512, + max_model_len=512, + disable_hybrid_kv_cache_manager=True, + is_encoder_decoder=model_config.is_encoder_decoder, + ) cache_config = CacheConfig( block_size=16, gpu_memory_utilization=gpu_memory_utilization, diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index f35f91bb3adf8..98f1f44923b1c 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -92,18 +92,19 @@ def create_vllm_config( enable_permute_local_kv: bool = False, ) -> VllmConfig: """Initialize VllmConfig For Testing.""" - scheduler_config = SchedulerConfig( - max_num_seqs=max_num_seqs, - max_num_batched_tokens=max_num_batched_tokens, - max_model_len=max_model_len, - enable_chunked_prefill=enable_chunked_prefill, - ) model_config = ModelConfig( model=model, trust_remote_code=True, dtype="float16", seed=42, ) + scheduler_config = SchedulerConfig( + max_num_seqs=max_num_seqs, + max_num_batched_tokens=max_num_batched_tokens, + max_model_len=max_model_len, + enable_chunked_prefill=enable_chunked_prefill, + is_encoder_decoder=model_config.is_encoder_decoder, + ) # Cache config, optionally force APC cache_config = CacheConfig( block_size=block_size, diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 9436ab471c21b..616e57de339e2 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -66,7 +66,10 @@ def _create_proposer( device_config=DeviceConfig(device=current_platform.device_type), parallel_config=ParallelConfig(), 
load_config=LoadConfig(), - scheduler_config=SchedulerConfig(), + scheduler_config=SchedulerConfig( + max_model_len=model_config.max_model_len, + is_encoder_decoder=model_config.is_encoder_decoder, + ), ) return EagleProposer(vllm_config=vllm_config, device=current_platform.device_type) diff --git a/tests/v1/spec_decode/test_mtp.py b/tests/v1/spec_decode/test_mtp.py index c5c0491abaf7c..3b8813ceb818a 100644 --- a/tests/v1/spec_decode/test_mtp.py +++ b/tests/v1/spec_decode/test_mtp.py @@ -51,7 +51,10 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer: device_config=DeviceConfig(device=current_platform.device_type), parallel_config=ParallelConfig(), load_config=LoadConfig(), - scheduler_config=SchedulerConfig(), + scheduler_config=SchedulerConfig( + max_model_len=model_config.max_model_len, + is_encoder_decoder=model_config.is_encoder_decoder, + ), ) return EagleProposer(vllm_config=vllm_config, device=current_platform.device_type) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index 7b3a07b4e12a5..cfc06666e7984 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -26,16 +26,17 @@ from vllm.v1.worker.tpu_model_runner import ( def get_vllm_config(): - scheduler_config = SchedulerConfig( - max_num_seqs=10, - max_num_batched_tokens=512, - max_model_len=512, - ) model_config = ModelConfig( model="facebook/opt-125m", dtype="bfloat16", # TPUs typically use bfloat16 seed=42, ) + scheduler_config = SchedulerConfig( + max_num_seqs=10, + max_num_batched_tokens=512, + max_model_len=512, + is_encoder_decoder=model_config.is_encoder_decoder, + ) cache_config = CacheConfig( block_size=16, gpu_memory_utilization=0.9, diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 89669ee8b71a0..0439bef1226e3 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ 
b/tests/v1/worker/test_gpu_model_runner.py @@ -79,16 +79,17 @@ def initialize_kv_cache(runner: GPUModelRunner): def get_vllm_config(): - scheduler_config = SchedulerConfig( - max_num_seqs=10, - max_num_batched_tokens=512, - max_model_len=512, - ) model_config = ModelConfig( model="facebook/opt-125m", dtype="float16", seed=42, ) + scheduler_config = SchedulerConfig( + max_num_seqs=10, + max_num_batched_tokens=512, + max_model_len=512, + is_encoder_decoder=model_config.is_encoder_decoder, + ) cache_config = CacheConfig( block_size=BLOCK_SIZE, gpu_memory_utilization=0.9, @@ -784,14 +785,15 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch): initialize_model_parallel(tensor_model_parallel_size=1) torch.set_default_dtype(torch.float16) + model_config = ModelConfig( + model="ibm-granite/granite-4.0-tiny-preview", + dtype="float16", + ) scheduler_config = SchedulerConfig( max_num_seqs=10, max_num_batched_tokens=512, max_model_len=512, - ) - model_config = ModelConfig( - model="ibm-granite/granite-4.0-tiny-preview", - dtype="float16", + is_encoder_decoder=model_config.is_encoder_decoder, ) cache_config = CacheConfig( block_size=BLOCK_SIZE, diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 1e089b42cccde..8da3ae538d671 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -28,6 +28,19 @@ SchedulerPolicy = Literal["fcfs", "priority"] class SchedulerConfig: """Scheduler configuration.""" + max_model_len: InitVar[int] + """Maximum length of a sequence (including prompt and generated text). + + Note: This is stored in the ModelConfig, and is used only here to + provide fallbacks and validate other attributes.""" + + is_encoder_decoder: InitVar[bool] + """True if the model is an encoder-decoder model. + + Note: This is stored in the ModelConfig, and is used only here to + disable chunked prefill and prefix caching for encoder-decoder models. 
+ """ + DEFAULT_MAX_NUM_BATCHED_TOKENS: ClassVar[int] = 2048 DEFAULT_MAX_NUM_SEQS: ClassVar[int] = 128 @@ -73,19 +86,6 @@ class SchedulerConfig: is_multimodal_model: bool = False """True if the model is multimodal.""" - max_model_len: InitVar[int] = 8192 - """Maximum length of a sequence (including prompt and generated text). - - Note: This is stored in the ModelConfig, and is used only here to - provide fallbacks and validate other attributes.""" - - is_encoder_decoder: InitVar[bool] = False - """True if the model is an encoder-decoder model. - - Note: This is stored in the ModelConfig, and is used only here to - disable chunked prefill and prefix caching for encoder-decoder models. - """ - # TODO (ywang96): Make this configurable. max_num_encoder_input_tokens: int = Field(init=False) """Multimodal encoder compute budget, only used in V1. @@ -141,6 +141,17 @@ class SchedulerConfig: while a larger value (e.g., 10) reduces host overhead and may increase throughput by batching multiple tokens before sending.""" + @staticmethod + def default_factory(**kwargs): + """ + Factory method to create `SchedulerConfig` with default values for `InitVar`s. + """ + if "max_model_len" not in kwargs: + kwargs["max_model_len"] = 8192 + if "is_encoder_decoder" not in kwargs: + kwargs["is_encoder_decoder"] = False + return SchedulerConfig(**kwargs) + def get_scheduler_cls(self) -> type["SchedulerInterface"]: if self.scheduler_cls is None: if self.async_scheduling: @@ -284,8 +295,3 @@ class SchedulerConfig: ) return self - - def __getattribute__(self, name: str) -> Any: - if name == "max_model_len" or name == "is_encoder_decoder": - raise AttributeError(f"{name} is an init-only parameter. 
") - return object.__getattribute__(self, name) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 615b1f8489eff..5b3a9c437662b 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -170,7 +170,9 @@ class VllmConfig: """Cache configuration.""" parallel_config: ParallelConfig = Field(default_factory=ParallelConfig) """Parallel configuration.""" - scheduler_config: SchedulerConfig = Field(default_factory=SchedulerConfig) + scheduler_config: SchedulerConfig = Field( + default_factory=SchedulerConfig.default_factory, + ) """Scheduler configuration.""" device_config: DeviceConfig = Field(default_factory=DeviceConfig) """Device configuration.""" From 68ffbca7e462cfa6a32b46dabc9a604c7c1b918d Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 2 Dec 2025 20:30:40 +0800 Subject: [PATCH 22/45] [Chore] Use `tokenizer.encode` and `tokenizer.decode` directly (#29851) Signed-off-by: DarkLight1337 --- .../multimodal/processing/test_common.py | 14 ++++++----- .../multimodal/processing/test_llama4.py | 3 +-- vllm/entrypoints/openai/speech_to_text.py | 2 +- vllm/entrypoints/renderer.py | 4 +-- vllm/entrypoints/score_utils.py | 2 +- .../model_executor/models/nano_nemotron_vl.py | 25 ++++++++----------- .../models/qwen2_5_omni_thinker.py | 3 +-- vllm/multimodal/processing.py | 19 ++++++-------- vllm/transformers_utils/tokenizer.py | 4 +++ 9 files changed, 36 insertions(+), 40 deletions(-) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 90158a028b0bd..8ef1fba8df3e3 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -22,8 +22,11 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict from vllm.multimodal.cache import MultiModalProcessorOnlyCache from vllm.multimodal.inputs import MultiModalInputs from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext -from vllm.tokenizers import 
MistralTokenizer, cached_tokenizer_from_config -from vllm.transformers_utils.tokenizer import encode_tokens +from vllm.tokenizers import ( + MistralTokenizer, + TokenizerLike, + cached_tokenizer_from_config, +) from ....multimodal.utils import random_audio, random_image, random_video from ...registry import ( @@ -151,7 +154,7 @@ def get_text_token_prompts( mm_data: MultiModalDataDict, ): dummy_inputs = processor.dummy_inputs - tokenizer = processor.info.get_tokenizer() + tokenizer: TokenizerLike = processor.info.get_tokenizer() model_config = processor.info.ctx.model_config model_type = model_config.hf_config.model_type @@ -188,10 +191,9 @@ def get_text_token_prompts( assert isinstance(inputs.prompt, str) text_prompt = inputs.prompt - token_prompt = encode_tokens( - tokenizer, + token_prompt = tokenizer.encode( text_prompt, - add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type), + add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type, True), ) return text_prompt, token_prompt diff --git a/tests/models/multimodal/processing/test_llama4.py b/tests/models/multimodal/processing/test_llama4.py index 4c0791ea3cece..b73246b68b36a 100644 --- a/tests/models/multimodal/processing/test_llama4.py +++ b/tests/models/multimodal/processing/test_llama4.py @@ -5,7 +5,6 @@ import pytest from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.transformers_utils.tokenizer import encode_tokens from ....conftest import ImageTestAssets from ...utils import build_model_context @@ -48,7 +47,7 @@ def test_processor_override( ] } if tokenized_prompt: - prompt = encode_tokens(tokenizer, prompt) + prompt = tokenizer.encode(prompt) processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) mm_data = processed_inputs["mm_kwargs"].get_data() diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index b34446d3230b1..cea9924ebbaca 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ 
b/vllm/entrypoints/openai/speech_to_text.py @@ -37,7 +37,7 @@ from vllm.inputs.data import PromptType from vllm.logger import init_logger from vllm.model_executor.models import SupportsTranscription from vllm.outputs import RequestOutput -from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.tokenizers import get_tokenizer from vllm.utils.import_utils import PlaceholderModule try: diff --git a/vllm/entrypoints/renderer.py b/vllm/entrypoints/renderer.py index 10b90bbbb0f32..f31b309b8ca48 100644 --- a/vllm/entrypoints/renderer.py +++ b/vllm/entrypoints/renderer.py @@ -33,7 +33,7 @@ class RenderConfig: `0` yields an empty list (and skips embeds). `-1` maps to `model_config.max_model_len`.""" - add_special_tokens: bool | None = True + add_special_tokens: bool = True """Whether to add model-specific special tokens during tokenization.""" cache_salt: str | None = None @@ -315,7 +315,7 @@ class CompletionRenderer(BaseRenderer): text: str, max_length: int | None, truncate_prompt_tokens: int | None, - add_special_tokens: bool | None, + add_special_tokens: bool, cache_salt: str | None, ) -> EngineTokensPrompt: """Tokenize text input asynchronously.""" diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py index 602f59ac09f55..8819c85af9a26 100644 --- a/vllm/entrypoints/score_utils.py +++ b/vllm/entrypoints/score_utils.py @@ -19,7 +19,7 @@ from vllm.inputs import TokensPrompt from vllm.model_executor.models.interfaces import supports_score_template from vllm.multimodal.inputs import MultiModalDataDict from vllm.outputs import PoolingRequestOutput -from vllm.transformers_utils.tokenizer import TokenizerLike +from vllm.tokenizers import TokenizerLike ScoreContentPartParam: TypeAlias = ( ChatCompletionContentPartImageParam | ChatCompletionContentPartImageEmbedsParam diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 0f86a17752802..891a9ce080233 100644 --- 
a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -75,7 +75,6 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config from vllm.transformers_utils.configs.radio import RadioConfig -from vllm.transformers_utils.tokenizer import encode_tokens from vllm.utils.tensor_schema import TensorSchema, TensorShape from .utils import _merge_multimodal_embeddings @@ -454,14 +453,12 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor): # Pre-tokenize special tokens for video processing # to avoid repeated tokenization - self._img_start_token_ids = encode_tokens( - tokenizer, IMG_START, add_special_tokens=False + self._img_start_token_ids = tokenizer.encode( + IMG_START, add_special_tokens=False ) - self._img_end_token_ids = encode_tokens( - tokenizer, IMG_END, add_special_tokens=False - ) - self._img_context_token_ids = encode_tokens( - tokenizer, IMG_CONTEXT, add_special_tokens=False + self._img_end_token_ids = tokenizer.encode(IMG_END, add_special_tokens=False) + self._img_context_token_ids = tokenizer.encode( + IMG_CONTEXT, add_special_tokens=False ) @property @@ -1179,14 +1176,12 @@ class NemotronH_Nano_VL_V2( # Pre-tokenize special tokens for video processing # to avoid repeated tokenization tokenizer = cached_tokenizer_from_config(vllm_config.model_config) - self._img_start_token_ids = encode_tokens( - tokenizer, IMG_START, add_special_tokens=False + self._img_start_token_ids = tokenizer.encode( + IMG_START, add_special_tokens=False ) - self._img_end_token_ids = encode_tokens( - tokenizer, IMG_END, add_special_tokens=False - ) - self._img_context_token_ids = encode_tokens( - tokenizer, IMG_CONTEXT, add_special_tokens=False + self._img_end_token_ids = tokenizer.encode(IMG_END, add_special_tokens=False) + self._img_context_token_ids = tokenizer.encode( + IMG_CONTEXT, 
add_special_tokens=False ) def pixel_shuffle(self, x, scale_factor=0.5): diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index 7506ee8656fda..1ce0fb4e4d93d 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -88,7 +88,6 @@ from vllm.multimodal.processing import ( ) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.tokenizer import encode_tokens from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import ( @@ -591,7 +590,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor( tokenization_kwargs=tokenization_kwargs, ) tokenizer = self.info.get_tokenizer() - prompt_ids = encode_tokens(tokenizer, prompt) + prompt_ids = tokenizer.encode(prompt) else: prompt_ids = self._apply_hf_processor_tokens_only(prompt) diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 2f651bd71706f..f241e79cfa7cb 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -25,7 +25,6 @@ from typing_extensions import TypeVar, assert_never from vllm.logger import init_logger from vllm.tokenizers import TokenizerLike from vllm.transformers_utils.processor import cached_processor_from_config -from vllm.transformers_utils.tokenizer import decode_tokens, encode_tokens from vllm.utils.collection_utils import flatten_2d_lists, full_groupby from vllm.utils.func_utils import get_allowed_kwarg_only_overrides from vllm.utils.jsontree import JSONTree, json_map_leaves @@ -80,9 +79,9 @@ def _cached_encode( tokenizer: TokenizerLike, text: str, *, - add_special_tokens: bool | None = None, + add_special_tokens: bool = True, ) -> list[int]: - return encode_tokens(tokenizer, text, add_special_tokens=add_special_tokens) + return tokenizer.encode(text, add_special_tokens=add_special_tokens) @lru_cache(maxsize=2048) @@ 
-90,11 +89,9 @@ def _cached_decode( tokenizer: TokenizerLike, token_ids: tuple[int, ...], *, - skip_special_tokens: bool | None = None, + skip_special_tokens: bool = False, ) -> str: - return decode_tokens( - tokenizer, list(token_ids), skip_special_tokens=skip_special_tokens - ) + return tokenizer.decode(list(token_ids), skip_special_tokens=skip_special_tokens) def _seq2text( @@ -110,7 +107,7 @@ def _seq2text( raise ValueError("You cannot decode tokens when `skip_tokenizer_init=True`") if not use_cache: - return decode_tokens(tokenizer, seq) + return tokenizer.decode(seq) return _cached_decode(tokenizer, tuple(seq)) @@ -126,7 +123,7 @@ def _seq2tokens( raise ValueError("You cannot encode text when `skip_tokenizer_init=True`") if not use_cache: - return encode_tokens(tokenizer, seq, add_special_tokens=False) + return tokenizer.encode(seq, add_special_tokens=False) return _cached_encode(tokenizer, seq, add_special_tokens=False) @@ -2198,8 +2195,8 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]): tokenizer = self.info.get_tokenizer() decoder_prompt_raw = self.create_decoder_prompt(prompt, mm_data) if isinstance(decoder_prompt_raw, str): - decoder_prompt_ids = encode_tokens( - tokenizer, decoder_prompt_raw, add_special_tokens=False + decoder_prompt_ids = tokenizer.encode( + decoder_prompt_raw, add_special_tokens=False ) else: decoder_prompt_ids = decoder_prompt_raw diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 617d16779ca26..32999903b3480 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -4,6 +4,8 @@ import warnings from typing import Any +from typing_extensions import deprecated + from vllm.logger import init_logger from vllm.tokenizers import TokenizerLike @@ -73,6 +75,7 @@ def __getattr__(name: str): raise AttributeError(f"module {__name__!r} has no attribute {name!r}") +@deprecated("Will be removed in v0.13. 
Please use `tokenizer.decode()` instead.") def decode_tokens( tokenizer: TokenizerLike, token_ids: list[int], @@ -94,6 +97,7 @@ def decode_tokens( return tokenizer.decode(token_ids, **kw_args) +@deprecated("Will be removed in v0.13. Please use `tokenizer.encode()` instead.") def encode_tokens( tokenizer: TokenizerLike, text: str, From 60c3d413afccab6a1f9a18cf3cd1fe11019c1040 Mon Sep 17 00:00:00 2001 From: ImaGoodFella <31959740+ImaGoodFella@users.noreply.github.com> Date: Tue, 2 Dec 2025 14:49:02 +0100 Subject: [PATCH 23/45] [Multimodal][Core] Optimize multimodal preprocessing cache by hashing image bytes instead of pixel values (#29621) Signed-off-by: Rahul Steiger Co-authored-by: Cyrus Leung --- tests/conftest.py | 7 ++++- tests/entrypoints/openai/test_vision.py | 7 ++++- .../pooling/embed/test_online_vision.py | 7 ++++- vllm/multimodal/base.py | 28 +++++++++++++++++++ vllm/multimodal/hasher.py | 24 ++++++++++++---- vllm/multimodal/image.py | 24 +++++++++------- vllm/multimodal/parse.py | 15 ++++++++++ vllm/multimodal/processing.py | 2 +- 8 files changed, 95 insertions(+), 19 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 53bbaddd0bb7f..b20c9efef542a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -59,6 +59,7 @@ from vllm.distributed import ( ) from vllm.logger import init_logger from vllm.logprobs import Logprob +from vllm.multimodal.base import MediaWithBytes from vllm.multimodal.utils import fetch_image from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams @@ -1389,7 +1390,11 @@ class LocalAssetServer: return f"{self.base_url}/{name}" def get_image_asset(self, name: str) -> Image.Image: - return fetch_image(self.url_for(name)) + image = fetch_image(self.url_for(name)) + # Unwrap MediaWithBytes if present + if isinstance(image, MediaWithBytes): + image = image.media + return image @pytest.fixture(scope="session") diff --git a/tests/entrypoints/openai/test_vision.py 
b/tests/entrypoints/openai/test_vision.py index d83c6726e72da..ae8860ee877b4 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -8,6 +8,7 @@ import pytest import pytest_asyncio from transformers import AutoProcessor +from vllm.multimodal.base import MediaWithBytes from vllm.multimodal.utils import encode_image_base64, fetch_image from ...utils import RemoteOpenAIServer @@ -111,7 +112,11 @@ def get_hf_prompt_tokens(model_name, content, image_url): "content": f"{placeholder}{content}", } ] - images = [fetch_image(image_url)] + image = fetch_image(image_url) + # Unwrap MediaWithBytes if present + if isinstance(image, MediaWithBytes): + image = image.media + images = [image] prompt = processor.tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True diff --git a/tests/entrypoints/pooling/embed/test_online_vision.py b/tests/entrypoints/pooling/embed/test_online_vision.py index 83e7048b9def6..eebbcdd2e4396 100644 --- a/tests/entrypoints/pooling/embed/test_online_vision.py +++ b/tests/entrypoints/pooling/embed/test_online_vision.py @@ -9,6 +9,7 @@ from transformers import AutoProcessor from tests.utils import VLLM_PATH, RemoteOpenAIServer from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse +from vllm.multimodal.base import MediaWithBytes from vllm.multimodal.utils import encode_image_base64, fetch_image MODEL_NAME = "TIGER-Lab/VLM2Vec-Full" @@ -62,7 +63,11 @@ def get_hf_prompt_tokens(model_name, content, image_url): placeholder = "<|image_1|> " prompt = f"{placeholder}{content}" - images = [fetch_image(image_url)] + image = fetch_image(image_url) + # Unwrap MediaWithBytes if present + if isinstance(image, MediaWithBytes): + image = image.media + images = [image] inputs = processor(prompt, images, return_tensors="pt") return inputs.input_ids.shape[1] diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index fef118a93c6cb..4a619fd303ca9 100644 --- 
a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -2,12 +2,40 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod +from dataclasses import dataclass from pathlib import Path from typing import Generic, TypeVar +import numpy as np + _T = TypeVar("_T") +@dataclass +class MediaWithBytes(Generic[_T]): + """ + Wrapper that couples a media object with its original encoded bytes. + + This ensures the raw bytes and media object remain synchronized, + preventing cache corruption from in-place modifications. + + The wrapper delegates attribute access to the underlying media object, + making it behave transparently like the wrapped type (e.g., PIL.Image). + """ + + media: _T + original_bytes: bytes + + def __array__(self, *args, **kwargs) -> np.ndarray: + """Allow np.array(obj) to return np.array(obj.media).""" + return np.array(self.media, *args, **kwargs) + + def __getattr__(self, name: str): + """Delegate attribute access to the underlying media object.""" + # This is only called when the attribute is not found on self + return getattr(self.media, name) + + class MediaIO(ABC, Generic[_T]): @abstractmethod def load_bytes(self, data: bytes) -> _T: diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index d0dcbb25fcce8..cc50322fed902 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -12,6 +12,8 @@ from PIL import Image from vllm.logger import init_logger +from .base import MediaWithBytes + logger = init_logger(__name__) @@ -31,14 +33,26 @@ class MultiModalHasher: if Image.ExifTags.Base.ImageID in exif and isinstance( exif[Image.ExifTags.Base.ImageID], uuid.UUID ): - # If the image has exif ImageID tag, use that return (exif[Image.ExifTags.Base.ImageID].bytes,) + data = {"mode": obj.mode, "data": np.asarray(obj)} - if obj.palette is not None: - data["palette"] = obj.palette.palette - if obj.palette.rawmode is not None: - data["palette_rawmode"] = obj.palette.rawmode + 
palette = obj.palette + if palette is not None: + data["palette"] = palette.palette + if palette.rawmode is not None: + data["palette_rawmode"] = palette.rawmode + return cls.iter_item_to_bytes("image", data) + + if isinstance(obj, MediaWithBytes) and isinstance(obj.media, Image.Image): + exif = obj.media.getexif() + if Image.ExifTags.Base.ImageID in exif and isinstance( + exif[Image.ExifTags.Base.ImageID], uuid.UUID + ): + return (exif[Image.ExifTags.Base.ImageID].bytes,) + + return cls.iter_item_to_bytes("image", obj.original_bytes) + if isinstance(obj, torch.Tensor): tensor_obj: torch.Tensor = obj.cpu() tensor_dtype = tensor_obj.dtype diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 21e8bef97a787..789421e9e0c3b 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -8,7 +8,7 @@ import pybase64 import torch from PIL import Image -from .base import MediaIO +from .base import MediaIO, MediaWithBytes def rescale_image_size( @@ -74,8 +74,12 @@ class ImageMediaIO(MediaIO[Image.Image]): ) self.rgba_background_color = rgba_bg - def _convert_image_mode(self, image: Image.Image) -> Image.Image: + def _convert_image_mode( + self, image: Image.Image | MediaWithBytes[Image.Image] + ) -> Image.Image: """Convert image mode with custom background color.""" + if isinstance(image, MediaWithBytes): + image = image.media if image.mode == self.image_mode: return image elif image.mode == "RGBA" and self.image_mode == "RGB": @@ -83,18 +87,18 @@ class ImageMediaIO(MediaIO[Image.Image]): else: return convert_image_mode(image, self.image_mode) - def load_bytes(self, data: bytes) -> Image.Image: + def load_bytes(self, data: bytes) -> MediaWithBytes[Image.Image]: image = Image.open(BytesIO(data)) - image.load() - return self._convert_image_mode(image) + return MediaWithBytes(self._convert_image_mode(image), data) - def load_base64(self, media_type: str, data: str) -> Image.Image: + def load_base64(self, media_type: str, data: str) -> 
MediaWithBytes[Image.Image]: return self.load_bytes(pybase64.b64decode(data, validate=True)) - def load_file(self, filepath: Path) -> Image.Image: - image = Image.open(filepath) - image.load() - return self._convert_image_mode(image) + def load_file(self, filepath: Path) -> MediaWithBytes[Image.Image]: + with open(filepath, "rb") as f: + data = f.read() + image = Image.open(BytesIO(data)) + return MediaWithBytes(self._convert_image_mode(image), data) def encode_base64( self, diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 810f29072a0fe..0d3b8289e4e12 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -23,6 +23,7 @@ from vllm.utils.collection_utils import is_list_of from vllm.utils.import_utils import LazyLoader from .audio import AudioResampler +from .base import MediaWithBytes from .inputs import ( AudioItem, HfAudioItem, @@ -84,6 +85,12 @@ class ModalityDataItems(ABC, Generic[_T, _I]): """Get all data items.""" return [self.get(idx) for idx in range(self.get_count())] + def get_item_for_hash(self, index: int) -> object: + return self.get(index) + + def get_all_items_for_hash(self) -> list[object]: + return [self.get_item_for_hash(idx) for idx in range(self.get_count())] + @abstractmethod def get_processor_data(self) -> Mapping[str, object]: """Get the data to pass to the HF processor.""" @@ -98,10 +105,18 @@ class ModalityDataItems(ABC, Generic[_T, _I]): class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]): """Base class for data items that are arranged in a list.""" + def _unwrap(self, item: _T | MediaWithBytes[_T]) -> _T: + """Extract media from wrapper if present.""" + return item.media if isinstance(item, MediaWithBytes) else item + def get_count(self) -> int: return len(self.data) def get(self, index: int) -> _T: + return self._unwrap(self.data[index]) + + def get_item_for_hash(self, index: int) -> _T | MediaWithBytes[_T]: + # Return raw item for hashing (preserves original_bytes if present) return 
self.data[index] def get_processor_data(self) -> Mapping[str, object]: diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index f241e79cfa7cb..0390773783961 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1684,7 +1684,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): # For None entries, compute a hash; otherwise, use provided ID. computed: list[str] = [] - for i, item in enumerate(items): + for i, item in enumerate(items.get_all_items_for_hash()): item_uuid = mm_uuids_per_modality[i] # NOTE: Even if a item_uuid is provided, we still compute a From 51c57b51dd51d87715367850faae1da7a9cabaef Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 2 Dec 2025 10:52:18 -0500 Subject: [PATCH 24/45] [Bugfix] Fix DeepSeek R1 MTP weight loading (#29545) Signed-off-by: Matthew Bonanni Co-authored-by: Benjamin Chislett --- vllm/model_executor/models/deepseek_mtp.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index 6e23037b919ab..ca77b8322e2e8 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -346,11 +346,16 @@ class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts): # Use expert_params_mapping to locate the destination # param and delegate to its expert-aware weight_loader # with expert_id. 
+ is_expert_weight = False for mapping in expert_params_mapping: param_name, weight_name, expert_id, shard_id = mapping if weight_name not in chunk_name: continue + # Anyway, this is an expert weight and should not be + # attempted to load as other weights later + is_expert_weight = True + # Do not modify `name` since the loop may continue here # Instead, create a new variable name_mapped = chunk_name.replace(weight_name, param_name) @@ -377,6 +382,12 @@ class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts): loaded_params.add(name_mapped) break else: + if is_expert_weight: + # We've checked that this is an expert weight + # However it's not mapped locally to this rank + # So we simply skip it + continue + # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue From 2eb4fe912916aea8998d085786df7abd7737e1f3 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 2 Dec 2025 23:54:28 +0800 Subject: [PATCH 25/45] [examples] Resettle pooling examples. 
(#29365) Signed-off-by: wang.yuqi Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 22 +-- .github/CODEOWNERS | 4 +- docs/.nav.yml | 6 +- docs/design/io_processor_plugins.md | 2 +- docs/mkdocs/hooks/generate_examples.py | 148 ++++++++++-------- docs/models/pooling_models.md | 4 +- docs/models/supported_models.md | 4 +- docs/serving/openai_compatible_server.md | 14 +- examples/offline_inference/pooling/README.md | 57 ------- examples/online_serving/pooling/README.md | 97 ------------ .../classify}/openai_classification_client.py | 0 .../embed}/embed_jina_embeddings_v3.py | 0 .../embed}/embed_matryoshka_fy.py | 0 .../embedding_requests_base64_client.py | 0 .../embed}/embedding_requests_bytes_client.py | 0 ...ai_chat_embedding_client_for_multimodal.py | 0 .../embed}/openai_embedding_client.py | 0 .../openai_embedding_long_text/README.md | 0 .../openai_embedding_long_text/client.py | 0 .../openai_embedding_long_text/service.sh | 0 .../embed}/openai_embedding_matryoshka_fy.py | 0 .../plugin/prithvi_geospatial_mae_client.py} | 0 .../prithvi_geospatial_mae_io_processor.py | 0 .../plugin/prithvi_geospatial_mae_offline.py} | 0 .../pooling/openai_pooling_client.py | 0 .../pooling}/vision_language_pooling.py | 0 .../score}/cohere_rerank_client.py | 0 .../score}/convert_model_to_seq_cls.py | 0 .../score}/jinaai_rerank_client.py | 0 .../score}/openai_cross_encoder_score.py | 0 ...enai_cross_encoder_score_for_multimodal.py | 0 .../score}/qwen3_reranker.py | 0 .../pooling => pooling/token_classify}/ner.py | 0 .../token_classify}/ner_client.py | 0 .../token_embed}/multi_vector_retrieval.py | 0 .../multi_vector_retrieval_client.py | 0 36 files changed, 109 insertions(+), 249 deletions(-) delete mode 100644 examples/offline_inference/pooling/README.md delete mode 100644 examples/online_serving/pooling/README.md rename examples/{online_serving/pooling => 
pooling/classify}/openai_classification_client.py (100%) rename examples/{offline_inference/pooling => pooling/embed}/embed_jina_embeddings_v3.py (100%) rename examples/{offline_inference/pooling => pooling/embed}/embed_matryoshka_fy.py (100%) rename examples/{online_serving/pooling => pooling/embed}/embedding_requests_base64_client.py (100%) rename examples/{online_serving/pooling => pooling/embed}/embedding_requests_bytes_client.py (100%) rename examples/{online_serving/pooling => pooling/embed}/openai_chat_embedding_client_for_multimodal.py (100%) rename examples/{online_serving/pooling => pooling/embed}/openai_embedding_client.py (100%) rename examples/{online_serving => pooling/embed}/openai_embedding_long_text/README.md (100%) rename examples/{online_serving => pooling/embed}/openai_embedding_long_text/client.py (100%) rename examples/{online_serving => pooling/embed}/openai_embedding_long_text/service.sh (100%) rename examples/{online_serving/pooling => pooling/embed}/openai_embedding_matryoshka_fy.py (100%) rename examples/{online_serving/pooling/prithvi_geospatial_mae.py => pooling/plugin/prithvi_geospatial_mae_client.py} (100%) rename examples/{offline_inference/pooling => pooling/plugin}/prithvi_geospatial_mae_io_processor.py (100%) rename examples/{offline_inference/pooling/prithvi_geospatial_mae.py => pooling/plugin/prithvi_geospatial_mae_offline.py} (100%) rename examples/{online_serving => pooling}/pooling/openai_pooling_client.py (100%) rename examples/{offline_inference => pooling/pooling}/vision_language_pooling.py (100%) rename examples/{online_serving/pooling => pooling/score}/cohere_rerank_client.py (100%) rename examples/{offline_inference/pooling => pooling/score}/convert_model_to_seq_cls.py (100%) rename examples/{online_serving/pooling => pooling/score}/jinaai_rerank_client.py (100%) rename examples/{online_serving/pooling => pooling/score}/openai_cross_encoder_score.py (100%) rename examples/{online_serving/pooling => 
pooling/score}/openai_cross_encoder_score_for_multimodal.py (100%) rename examples/{offline_inference/pooling => pooling/score}/qwen3_reranker.py (100%) rename examples/{offline_inference/pooling => pooling/token_classify}/ner.py (100%) rename examples/{online_serving/pooling => pooling/token_classify}/ner_client.py (100%) rename examples/{offline_inference/pooling => pooling/token_embed}/multi_vector_retrieval.py (100%) rename examples/{online_serving/pooling => pooling/token_embed}/multi_vector_retrieval_client.py (100%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 9f2107fb1e5ab..52c848c784e53 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -390,20 +390,24 @@ steps: - examples/ commands: - pip install tensorizer # for tensorizer test + # for basic + - python3 offline_inference/basic/chat.py - python3 offline_inference/basic/generate.py --model facebook/opt-125m - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 offline_inference/basic/chat.py - - python3 offline_inference/prefix_caching.py - - python3 offline_inference/llm_engine_example.py - - python3 offline_inference/audio_language.py --seed 0 - - python3 offline_inference/vision_language.py --seed 0 - - python3 offline_inference/vision_language_pooling.py --seed 0 - - python3 offline_inference/vision_language_multi_image.py --seed 0 - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - python3 offline_inference/basic/classify.py - python3 offline_inference/basic/embed.py - python3 offline_inference/basic/score.py + # for multi-modal models + - python3 
offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 + # for pooling models + - python3 pooling/pooling/vision_language_pooling.py --seed 0 + # for features demo + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index ecb10d1a450f3..d6447649cd89a 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -146,10 +146,10 @@ mkdocs.yaml @hmellor /requirements/kv_connectors.txt @NickLucche # Pooling models -/examples/*/pooling/ @noooop +/examples/pooling @noooop /tests/models/*/pooling* @noooop /tests/entrypoints/pooling @noooop -/vllm/entrypoints/pooling @aarnphm @chaunceyjiang @noooop +/vllm/entrypoints/pooling @noooop /vllm/config/pooler.py @noooop /vllm/pooling_params.py @noooop /vllm/model_executor/layers/pooler.py @noooop diff --git a/docs/.nav.yml b/docs/.nav.yml index d30c0f12eba4c..aa98ad52be215 
100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -5,11 +5,7 @@ nav: - Getting Started: - getting_started/quickstart.md - getting_started/installation - - Examples: - - examples/README.md - - Offline Inference: examples/offline_inference - - Online Serving: examples/online_serving - - Others: examples/others + - Examples: examples - General: - usage/v1_guide.md - usage/* diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md index b4a30cda35a01..5a86940fa9f13 100644 --- a/docs/design/io_processor_plugins.md +++ b/docs/design/io_processor_plugins.md @@ -79,7 +79,7 @@ The `post_process*` methods take `PoolingRequestOutput` objects as input and gen The `validate_or_generate_params` method is used for validating with the plugin any `SamplingParameters`/`PoolingParameters` received with the user request, or to generate new ones if none are specified. The function always returns the validated/generated parameters. The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available here [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/pooling/pooling/serving.py). -An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/online_serving/pooling/prithvi_geospatial_mae.py](../../examples/online_serving/pooling/prithvi_geospatial_mae.py)) and offline ([examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py](../../examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py)) inference examples. 
+An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/pooling/plugin/prithvi_geospatial_mae_client.py](../../examples/pooling/plugin/prithvi_geospatial_mae_client.py)) and offline ([examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py](../../examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py)) inference examples. ## Using an IO Processor plugin diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py index 6e4fb039e3a07..e886a91e65732 100644 --- a/docs/mkdocs/hooks/generate_examples.py +++ b/docs/mkdocs/hooks/generate_examples.py @@ -2,7 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools import logging -from dataclasses import dataclass, field +from dataclasses import dataclass +from functools import cached_property from pathlib import Path from typing import Literal @@ -16,13 +17,18 @@ EXAMPLE_DIR = ROOT_DIR / "examples" EXAMPLE_DOC_DIR = ROOT_DIR / "docs/examples" -def fix_case(text: str) -> str: +def title(text: str) -> str: + # Default title case + text = text.replace("_", " ").replace("/", " - ").title() + # Custom substitutions subs = { + "io": "IO", "api": "API", "cli": "CLI", "cpu": "CPU", "llm": "LLM", "mae": "MAE", + "ner": "NER", "tpu": "TPU", "gguf": "GGUF", "lora": "LoRA", @@ -48,71 +54,65 @@ class Example: Attributes: path (Path): The path to the main directory or file. category (str): The category of the document. - main_file (Path): The main file in the directory. - other_files (list[Path]): list of other files in the directory. - title (str): The title of the document. + + Properties:: + main_file() -> Path | None: Determines the main file in the given path. 
+ other_files() -> list[Path]: Determines other files in the directory excluding + the main file. + title() -> str: Determines the title of the document. Methods: - __post_init__(): Initializes the main_file, other_files, and title attributes. - determine_main_file() -> Path: Determines the main file in the given path. - determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file. - determine_title() -> str: Determines the title of the document. generate() -> str: Generates the documentation content. - """ # noqa: E501 + """ path: Path - category: str = None - main_file: Path = field(init=False) - other_files: list[Path] = field(init=False) - title: str = field(init=False) + category: str - def __post_init__(self): - self.main_file = self.determine_main_file() - self.other_files = self.determine_other_files() - self.title = self.determine_title() + @cached_property + def main_file(self) -> Path | None: + """Determines the main file in the given path. - @property - def is_code(self) -> bool: - return self.main_file.suffix != ".md" + If path is a file, it returns the path itself. If path is a directory, it + searches for Markdown files (*.md) in the directory and returns the first one + found. If no Markdown files are found, it returns None.""" + # Single file example + if self.path.is_file(): + return self.path + # Multi file example with a README + if md_paths := list(self.path.glob("*.md")): + return md_paths[0] + # Multi file example without a README + return None - def determine_main_file(self) -> Path: - """ - Determines the main file in the given path. - If the path is a file, it returns the path itself. Otherwise, it searches - for Markdown files (*.md) in the directory and returns the first one found. - Returns: - Path: The main file path, either the original path if it's a file or the first - Markdown file found in the directory. - Raises: - IndexError: If no Markdown files are found in the directory. 
- """ # noqa: E501 - return self.path if self.path.is_file() else list(self.path.glob("*.md")).pop() + @cached_property + def other_files(self) -> list[Path]: + """Determine other files in the directory excluding the main file. - def determine_other_files(self) -> list[Path]: - """ - Determine other files in the directory excluding the main file. - - This method checks if the given path is a file. If it is, it returns an empty list. - Otherwise, it recursively searches through the directory and returns a list of all - files that are not the main file. - - Returns: - list[Path]: A list of Path objects representing the other files in the directory. - """ # noqa: E501 + If path is a file, it returns an empty list. Otherwise, it returns every file + in the directory except the main file in a list.""" + # Single file example if self.path.is_file(): return [] + # Multi file example is_other_file = lambda file: file.is_file() and file != self.main_file - return [file for file in self.path.rglob("*") if is_other_file(file)] + return sorted(file for file in self.path.rglob("*") if is_other_file(file)) - def determine_title(self) -> str: - if not self.is_code: - # Specify encoding for building on Windows - with open(self.main_file, encoding="utf-8") as f: - first_line = f.readline().strip() - match = re.match(r"^#\s+(?P.+)$", first_line) - if match: - return match.group("title") - return fix_case(self.path.stem.replace("_", " ").title()) + @cached_property + def is_code(self) -> bool: + return self.main_file is not None and self.main_file.suffix != ".md" + + @cached_property + def title(self) -> str: + # Generate title from filename if no main md file found + if self.main_file is None or self.is_code: + return title(self.path.stem) + # Specify encoding for building on Windows + with open(self.main_file, encoding="utf-8") as f: + first_line = f.readline().strip() + match = re.match(r"^#\s+(?P<title>.+)$", first_line) + if match: + return match.group("title") + raise 
ValueError(f"Title not found in {self.main_file}") def fix_relative_links(self, content: str) -> str: """ @@ -156,24 +156,35 @@ class Example: # included files containing code fences too code_fence = "``````" - if self.is_code: - content += ( - f"{code_fence}{self.main_file.suffix[1:]}\n" - f'--8<-- "{self.main_file}"\n' - f"{code_fence}\n" - ) + if self.main_file is not None: + # Single file example or multi file example with a README + if self.is_code: + content += ( + f"{code_fence}{self.main_file.suffix[1:]}\n" + f'--8<-- "{self.main_file}"\n' + f"{code_fence}\n" + ) + else: + with open(self.main_file, encoding="utf-8") as f: + # Skip the title from md snippets as it's been included above + main_content = f.readlines()[1:] + content += self.fix_relative_links("".join(main_content)) + content += "\n" else: - with open(self.main_file) as f: - # Skip the title from md snippets as it's been included above - main_content = f.readlines()[1:] - content += self.fix_relative_links("".join(main_content)) - content += "\n" + # Multi file example without a README + for file in self.other_files: + file_title = title(str(file.relative_to(self.path).with_suffix(""))) + content += f"## {file_title}\n\n" + content += ( + f'{code_fence}{file.suffix[1:]}\n--8<-- "{file}"\n{code_fence}\n\n' + ) + return content if not self.other_files: return content content += "## Example materials\n\n" - for file in sorted(self.other_files): + for file in self.other_files: content += f'??? 
abstract "{file.relative_to(self.path)}"\n' if file.suffix != ".md": content += f" {code_fence}{file.suffix[1:]}\n" @@ -200,11 +211,13 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): glob_patterns = ["*.py", "*.md", "*.sh"] # Find categorised examples for category in categories: + logger.info("Processing category: %s", category.stem) globs = [category.glob(pattern) for pattern in glob_patterns] for path in itertools.chain(*globs): examples.append(Example(path, category.stem)) # Find examples in subdirectories - for path in category.glob("*/*.md"): + globs = [category.glob(f"*/{pattern}") for pattern in glob_patterns] + for path in itertools.chain(*globs): examples.append(Example(path.parent, category.stem)) # Generate the example documentation @@ -217,3 +230,4 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): with open(doc_path, "w+", encoding="utf-8") as f: f.write(example.generate()) logger.debug("Example generated: %s", doc_path.relative_to(ROOT_DIR)) + logger.info("Total examples generated: %d", len(examples)) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index aca865f4bf77d..e2d427e8a4590 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -274,7 +274,7 @@ outputs = llm.embed( print(outputs[0].outputs) ``` -A code example can be found here: [examples/offline_inference/pooling/embed_matryoshka_fy.py](../../examples/offline_inference/pooling/embed_matryoshka_fy.py) +A code example can be found here: [examples/pooling/embed/embed_matryoshka_fy.py](../../examples/pooling/embed/embed_matryoshka_fy.py) ### Online Inference @@ -304,7 +304,7 @@ Expected output: 
{"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}} ``` -An OpenAI client example can be found here: [examples/online_serving/pooling/openai_embedding_matryoshka_fy.py](../../examples/online_serving/pooling/openai_embedding_matryoshka_fy.py) +An OpenAI client example can be found here: [examples/pooling/embed/openai_embedding_matryoshka_fy.py](../../examples/pooling/embed/openai_embedding_matryoshka_fy.py) ## Deprecated Features diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 6ea2285b92bb8..040107c11efcf 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -568,7 +568,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A ``` !!! note - Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/offline_inference/pooling/qwen3_reranker.py](../../examples/offline_inference/pooling/qwen3_reranker.py). + Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/qwen3_reranker.py](../../examples/pooling/score/qwen3_reranker.py). 
```bash vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' @@ -606,7 +606,7 @@ These models primarily support the [`LLM.encode`](./pooling_models.md#llmencode) | `ModernBertForTokenClassification` | ModernBERT-based | `disham993/electrical-ner-ModernBERT-base` | | | !!! note - Named Entity Recognition (NER) usage, please refer to [examples/offline_inference/pooling/ner.py](../../examples/offline_inference/pooling/ner.py), [examples/online_serving/pooling/ner_client.py](../../examples/online_serving/pooling/ner_client.py). + Named Entity Recognition (NER) usage, please refer to [examples/pooling/token_classify/ner.py](../../examples/pooling/token_classify/ner.py), [examples/pooling/token_classify/ner_client.py](../../examples/pooling/token_classify/ner_client.py). ## List of Multimodal Language Models diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 672663dc50b1e..01453483a8d60 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -234,7 +234,7 @@ The following extra parameters are supported: Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings); you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. -Code example: [examples/online_serving/pooling/openai_embedding_client.py](../../examples/online_serving/pooling/openai_embedding_client.py) +Code example: [examples/pooling/embed/openai_embedding_client.py](../../examples/pooling/embed/openai_embedding_client.py) If the model has a [chat template](../serving/openai_compatible_server.md#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api)) which will be treated as a single prompt to the model. 
Here is a convenience function for calling the API while retaining OpenAI's type annotations: @@ -335,7 +335,7 @@ and passing a list of `messages` in the request. Refer to the examples below for `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code example below for details. -Full example: [examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py](../../examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py) +Full example: [examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py](../../examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py) #### Extra parameters @@ -516,7 +516,7 @@ Our Pooling API encodes input prompts using a [pooling model](../models/pooling_ The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats. -Code example: [examples/online_serving/pooling/openai_pooling_client.py](../../examples/online_serving/pooling/openai_pooling_client.py) +Code example: [examples/pooling/pooling/openai_pooling_client.py](../../examples/pooling/pooling/openai_pooling_client.py) ### Classification API @@ -524,7 +524,7 @@ Our Classification API directly supports Hugging Face sequence-classification mo We automatically wrap any other transformer via `as_seq_cls_model()`, which pools on the last token, attaches a `RowParallelLinear` head, and applies a softmax to produce per-class probabilities. 
-Code example: [examples/online_serving/pooling/openai_classification_client.py](../../examples/online_serving/pooling/openai_classification_client.py) +Code example: [examples/pooling/classify/openai_classification_client.py](../../examples/pooling/classify/openai_classification_client.py) #### Example Requests @@ -640,7 +640,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent You can find the documentation for cross encoder models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). -Code example: [examples/online_serving/pooling/openai_cross_encoder_score.py](../../examples/online_serving/pooling/openai_cross_encoder_score.py) +Code example: [examples/pooling/score/openai_cross_encoder_score.py](../../examples/pooling/score/openai_cross_encoder_score.py) #### Single inference @@ -821,7 +821,7 @@ You can pass multi-modal inputs to scoring models by passing `content` including print("Scoring output:", response_json["data"][0]["score"]) print("Scoring output:", response_json["data"][1]["score"]) ``` -Full example: [examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py](../../examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py) +Full example: [examples/pooling/score/openai_cross_encoder_score_for_multimodal.py](../../examples/pooling/score/openai_cross_encoder_score_for_multimodal.py) #### Extra parameters @@ -851,7 +851,7 @@ endpoints are compatible with both [Jina AI's re-rank API interface](https://jin [Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with popular open-source tools. 
-Code example: [examples/online_serving/pooling/jinaai_rerank_client.py](../../examples/online_serving/pooling/jinaai_rerank_client.py) +Code example: [examples/pooling/score/jinaai_rerank_client.py](../../examples/pooling/score/jinaai_rerank_client.py) #### Example Request diff --git a/examples/offline_inference/pooling/README.md b/examples/offline_inference/pooling/README.md deleted file mode 100644 index ad78be38716b6..0000000000000 --- a/examples/offline_inference/pooling/README.md +++ /dev/null @@ -1,57 +0,0 @@ -# Pooling models - -## Convert llm model to seq cls - -```bash -# for BAAI/bge-reranker-v2-gemma -# Caution: "Yes" and "yes" are two different tokens -python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name BAAI/bge-reranker-v2-gemma --classifier_from_tokens '["Yes"]' --method no_post_processing --path ./bge-reranker-v2-gemma-seq-cls -# for mxbai-rerank-v2 -python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name mixedbread-ai/mxbai-rerank-base-v2 --classifier_from_tokens '["0", "1"]' --method from_2_way_softmax --path ./mxbai-rerank-base-v2-seq-cls -# for Qwen3-Reranker -python examples/offline_inference/pooling/convert_model_to_seq_cls.py --model_name Qwen/Qwen3-Reranker-0.6B --classifier_from_tokens '["no", "yes"]' --method from_2_way_softmax --path ./Qwen3-Reranker-0.6B-seq-cls -``` - -## Embed jina_embeddings_v3 usage - -Only text matching task is supported for now. 
See <https://github.com/vllm-project/vllm/pull/16120> - -```bash -python examples/offline_inference/pooling/embed_jina_embeddings_v3.py -``` - -## Embed matryoshka dimensions usage - -```bash -python examples/offline_inference/pooling/embed_matryoshka_fy.py -``` - -## Multi vector retrieval usage - -```bash -python examples/offline_inference/pooling/multi_vector_retrieval.py -``` - -## Named Entity Recognition (NER) usage - -```bash -python examples/offline_inference/pooling/ner.py -``` - -## Prithvi Geospatial MAE usage - -```bash -python examples/offline_inference/pooling/prithvi_geospatial_mae.py -``` - -## IO Processor Plugins for Prithvi Geospatial MAE - -```bash -python examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py -``` - -## Qwen3 reranker usage - -```bash -python examples/offline_inference/pooling/qwen3_reranker.py -``` diff --git a/examples/online_serving/pooling/README.md b/examples/online_serving/pooling/README.md deleted file mode 100644 index b76ad21f04818..0000000000000 --- a/examples/online_serving/pooling/README.md +++ /dev/null @@ -1,97 +0,0 @@ -# Pooling models - -## Cohere rerank usage - -```bash -# vllm serve BAAI/bge-reranker-base -python examples/online_serving/pooling/cohere_rerank_client.py -``` - -## Embedding requests base64 encoding_format usage - -```bash -# vllm serve intfloat/e5-small -python examples/online_serving/pooling/embedding_requests_base64_client.py -``` - -## Embedding requests bytes encoding_format usage - -```bash -# vllm serve intfloat/e5-small -python examples/online_serving/pooling/embedding_requests_bytes_client.py -``` - -## Jinaai rerank usage - -```bash -# vllm serve BAAI/bge-reranker-base -python examples/online_serving/pooling/jinaai_rerank_client.py -``` - -## Multi vector retrieval usage - -```bash -# vllm serve BAAI/bge-m3 -python examples/online_serving/pooling/multi_vector_retrieval_client.py -``` - -## Named Entity Recognition (NER) usage - -```bash -# vllm serve 
boltuix/NeuroBERT-NER -python examples/online_serving/pooling/ner_client.py -``` - -## OpenAI chat embedding for multimodal usage - -```bash -python examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py -``` - -## OpenAI classification usage - -```bash -# vllm serve jason9693/Qwen2.5-1.5B-apeach -python examples/online_serving/pooling/openai_classification_client.py -``` - -## OpenAI cross_encoder score usage - -```bash -# vllm serve BAAI/bge-reranker-v2-m3 -python examples/online_serving/pooling/openai_cross_encoder_score.py -``` - -## OpenAI cross_encoder score for multimodal usage - -```bash -# vllm serve jinaai/jina-reranker-m0 -python examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py -``` - -## OpenAI embedding usage - -```bash -# vllm serve intfloat/e5-small -python examples/online_serving/pooling/openai_embedding_client.py -``` - -## OpenAI embedding matryoshka dimensions usage - -```bash -# vllm serve jinaai/jina-embeddings-v3 --trust-remote-code -python examples/online_serving/pooling/openai_embedding_matryoshka_fy.py -``` - -## OpenAI pooling usage - -```bash -# vllm serve internlm/internlm2-1_8b-reward --trust-remote-code -python examples/online_serving/pooling/openai_pooling_client.py -``` - -## Online Prithvi Geospatial MAE usage - -```bash -python examples/online_serving/pooling/prithvi_geospatial_mae.py -``` diff --git a/examples/online_serving/pooling/openai_classification_client.py b/examples/pooling/classify/openai_classification_client.py similarity index 100% rename from examples/online_serving/pooling/openai_classification_client.py rename to examples/pooling/classify/openai_classification_client.py diff --git a/examples/offline_inference/pooling/embed_jina_embeddings_v3.py b/examples/pooling/embed/embed_jina_embeddings_v3.py similarity index 100% rename from examples/offline_inference/pooling/embed_jina_embeddings_v3.py rename to examples/pooling/embed/embed_jina_embeddings_v3.py diff --git 
a/examples/offline_inference/pooling/embed_matryoshka_fy.py b/examples/pooling/embed/embed_matryoshka_fy.py similarity index 100% rename from examples/offline_inference/pooling/embed_matryoshka_fy.py rename to examples/pooling/embed/embed_matryoshka_fy.py diff --git a/examples/online_serving/pooling/embedding_requests_base64_client.py b/examples/pooling/embed/embedding_requests_base64_client.py similarity index 100% rename from examples/online_serving/pooling/embedding_requests_base64_client.py rename to examples/pooling/embed/embedding_requests_base64_client.py diff --git a/examples/online_serving/pooling/embedding_requests_bytes_client.py b/examples/pooling/embed/embedding_requests_bytes_client.py similarity index 100% rename from examples/online_serving/pooling/embedding_requests_bytes_client.py rename to examples/pooling/embed/embedding_requests_bytes_client.py diff --git a/examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py b/examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py similarity index 100% rename from examples/online_serving/pooling/openai_chat_embedding_client_for_multimodal.py rename to examples/pooling/embed/openai_chat_embedding_client_for_multimodal.py diff --git a/examples/online_serving/pooling/openai_embedding_client.py b/examples/pooling/embed/openai_embedding_client.py similarity index 100% rename from examples/online_serving/pooling/openai_embedding_client.py rename to examples/pooling/embed/openai_embedding_client.py diff --git a/examples/online_serving/openai_embedding_long_text/README.md b/examples/pooling/embed/openai_embedding_long_text/README.md similarity index 100% rename from examples/online_serving/openai_embedding_long_text/README.md rename to examples/pooling/embed/openai_embedding_long_text/README.md diff --git a/examples/online_serving/openai_embedding_long_text/client.py b/examples/pooling/embed/openai_embedding_long_text/client.py similarity index 100% rename from 
examples/online_serving/openai_embedding_long_text/client.py rename to examples/pooling/embed/openai_embedding_long_text/client.py diff --git a/examples/online_serving/openai_embedding_long_text/service.sh b/examples/pooling/embed/openai_embedding_long_text/service.sh similarity index 100% rename from examples/online_serving/openai_embedding_long_text/service.sh rename to examples/pooling/embed/openai_embedding_long_text/service.sh diff --git a/examples/online_serving/pooling/openai_embedding_matryoshka_fy.py b/examples/pooling/embed/openai_embedding_matryoshka_fy.py similarity index 100% rename from examples/online_serving/pooling/openai_embedding_matryoshka_fy.py rename to examples/pooling/embed/openai_embedding_matryoshka_fy.py diff --git a/examples/online_serving/pooling/prithvi_geospatial_mae.py b/examples/pooling/plugin/prithvi_geospatial_mae_client.py similarity index 100% rename from examples/online_serving/pooling/prithvi_geospatial_mae.py rename to examples/pooling/plugin/prithvi_geospatial_mae_client.py diff --git a/examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py b/examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py similarity index 100% rename from examples/offline_inference/pooling/prithvi_geospatial_mae_io_processor.py rename to examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py diff --git a/examples/offline_inference/pooling/prithvi_geospatial_mae.py b/examples/pooling/plugin/prithvi_geospatial_mae_offline.py similarity index 100% rename from examples/offline_inference/pooling/prithvi_geospatial_mae.py rename to examples/pooling/plugin/prithvi_geospatial_mae_offline.py diff --git a/examples/online_serving/pooling/openai_pooling_client.py b/examples/pooling/pooling/openai_pooling_client.py similarity index 100% rename from examples/online_serving/pooling/openai_pooling_client.py rename to examples/pooling/pooling/openai_pooling_client.py diff --git a/examples/offline_inference/vision_language_pooling.py 
b/examples/pooling/pooling/vision_language_pooling.py similarity index 100% rename from examples/offline_inference/vision_language_pooling.py rename to examples/pooling/pooling/vision_language_pooling.py diff --git a/examples/online_serving/pooling/cohere_rerank_client.py b/examples/pooling/score/cohere_rerank_client.py similarity index 100% rename from examples/online_serving/pooling/cohere_rerank_client.py rename to examples/pooling/score/cohere_rerank_client.py diff --git a/examples/offline_inference/pooling/convert_model_to_seq_cls.py b/examples/pooling/score/convert_model_to_seq_cls.py similarity index 100% rename from examples/offline_inference/pooling/convert_model_to_seq_cls.py rename to examples/pooling/score/convert_model_to_seq_cls.py diff --git a/examples/online_serving/pooling/jinaai_rerank_client.py b/examples/pooling/score/jinaai_rerank_client.py similarity index 100% rename from examples/online_serving/pooling/jinaai_rerank_client.py rename to examples/pooling/score/jinaai_rerank_client.py diff --git a/examples/online_serving/pooling/openai_cross_encoder_score.py b/examples/pooling/score/openai_cross_encoder_score.py similarity index 100% rename from examples/online_serving/pooling/openai_cross_encoder_score.py rename to examples/pooling/score/openai_cross_encoder_score.py diff --git a/examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py b/examples/pooling/score/openai_cross_encoder_score_for_multimodal.py similarity index 100% rename from examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py rename to examples/pooling/score/openai_cross_encoder_score_for_multimodal.py diff --git a/examples/offline_inference/pooling/qwen3_reranker.py b/examples/pooling/score/qwen3_reranker.py similarity index 100% rename from examples/offline_inference/pooling/qwen3_reranker.py rename to examples/pooling/score/qwen3_reranker.py diff --git a/examples/offline_inference/pooling/ner.py 
b/examples/pooling/token_classify/ner.py similarity index 100% rename from examples/offline_inference/pooling/ner.py rename to examples/pooling/token_classify/ner.py diff --git a/examples/online_serving/pooling/ner_client.py b/examples/pooling/token_classify/ner_client.py similarity index 100% rename from examples/online_serving/pooling/ner_client.py rename to examples/pooling/token_classify/ner_client.py diff --git a/examples/offline_inference/pooling/multi_vector_retrieval.py b/examples/pooling/token_embed/multi_vector_retrieval.py similarity index 100% rename from examples/offline_inference/pooling/multi_vector_retrieval.py rename to examples/pooling/token_embed/multi_vector_retrieval.py diff --git a/examples/online_serving/pooling/multi_vector_retrieval_client.py b/examples/pooling/token_embed/multi_vector_retrieval_client.py similarity index 100% rename from examples/online_serving/pooling/multi_vector_retrieval_client.py rename to examples/pooling/token_embed/multi_vector_retrieval_client.py From 0ec84221718d920c3f46da879cc354f94b8fb59e Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Wed, 3 Dec 2025 00:03:52 +0800 Subject: [PATCH 26/45] [Bugfix] Fix incorrect channel order for idefics3 in edge case (#29881) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/model_executor/models/idefics3.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 9c5f9389e54bb..7c3933c6feb7e 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -338,6 +338,7 @@ class Idefics3MultiModalProcessor(BaseMultiModalProcessor[Idefics3ProcessingInfo prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") + mm_kwargs = 
{"input_data_format": "channels_last", **mm_kwargs} processed_outputs = super()._call_hf_processor( prompt, mm_data, From 52cb349fc010c3d9e8f576f7cc675e6403aadd0a Mon Sep 17 00:00:00 2001 From: Andrew Xia <axia@meta.com> Date: Tue, 2 Dec 2025 08:24:45 -0800 Subject: [PATCH 27/45] [responsesAPI][3] ResponsesParser to set up non harmony MCP (#29413) Signed-off-by: Andrew Xia <axia@fb.com> Co-authored-by: Andrew Xia <axia@fb.com> --- .../test_response_api_parsable_context.py | 87 +++++++++++++++ vllm/entrypoints/chat_utils.py | 1 + vllm/entrypoints/context.py | 76 +++++++++++++ vllm/entrypoints/openai/parser/__init__.py | 0 .../openai/parser/responses_parser.py | 101 ++++++++++++++++++ vllm/entrypoints/openai/serving_responses.py | 45 +++++--- vllm/entrypoints/responses_utils.py | 30 ++++++ vllm/envs.py | 5 + 8 files changed, 332 insertions(+), 13 deletions(-) create mode 100644 tests/entrypoints/openai/test_response_api_parsable_context.py create mode 100644 vllm/entrypoints/openai/parser/__init__.py create mode 100644 vllm/entrypoints/openai/parser/responses_parser.py diff --git a/tests/entrypoints/openai/test_response_api_parsable_context.py b/tests/entrypoints/openai/test_response_api_parsable_context.py new file mode 100644 index 0000000000000..1b2795770d4c7 --- /dev/null +++ b/tests/entrypoints/openai/test_response_api_parsable_context.py @@ -0,0 +1,87 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import pytest +import pytest_asyncio +from openai import OpenAI + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "Qwen/Qwen3-8B" + + +@pytest.fixture(scope="module") +def server(): + args = ["--reasoning-parser", "qwen3", "--max_model_len", "5000"] + env_dict = dict( + VLLM_ENABLE_RESPONSES_API_STORE="1", + VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT="1", + # uncomment for tool calling + # PYTHON_EXECUTION_BACKEND="dangerously_use_uv", + ) + + with RemoteOpenAIServer(MODEL_NAME, args, 
env_dict=env_dict) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_basic(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input="What is 13 * 24?", + ) + assert response is not None + print("response: ", response) + assert response.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_reasoning_and_function_items(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input=[ + {"type": "message", "content": "Hello.", "role": "user"}, + { + "type": "reasoning", + "id": "lol", + "content": [ + { + "type": "reasoning_text", + "text": "We need to respond: greeting.", + } + ], + "summary": [], + }, + { + "arguments": '{"location": "Paris", "unit": "celsius"}', + "call_id": "call_5f7b38f3b81e4b8380fd0ba74f3ca3ab", + "name": "get_weather", + "type": "function_call", + "id": "fc_4fe5d6fc5b6c4d6fa5f24cc80aa27f78", + "status": "completed", + }, + { + "call_id": "call_5f7b38f3b81e4b8380fd0ba74f3ca3ab", + "id": "fc_4fe5d6fc5b6c4d6fa5f24cc80aa27f78", + "output": "The weather in Paris is 20 Celsius", + "status": "completed", + "type": "function_call_output", + }, + ], + temperature=0.0, + ) + assert response is not None + assert response.status == "completed" + # make sure we get a reasoning and text output + assert response.output[0].type == "reasoning" + assert response.output[1].type == "message" + assert type(response.output[1].content[0].text) is str diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 1643906894c66..2dd5b9c8f8aa0 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1530,6 +1530,7 @@ def _parse_chat_message_content( role = 
message["role"] content = message.get("content") reasoning = message.get("reasoning") or message.get("reasoning_content") + if content is None: content = [] elif isinstance(content, str): diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index 7a41c668d7645..1260f65dba59a 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -5,6 +5,7 @@ import contextlib import json import logging from abc import ABC, abstractmethod +from collections.abc import Callable from contextlib import AsyncExitStack from typing import TYPE_CHECKING, Union @@ -17,9 +18,19 @@ from vllm.entrypoints.harmony_utils import ( get_streamable_parser_for_assistant, render_for_completion, ) +from vllm.entrypoints.openai.parser.responses_parser import ( + get_responses_parser_for_simple_context, +) +from vllm.entrypoints.openai.protocol import ( + ResponseInputOutputItem, + ResponsesRequest, +) +from vllm.entrypoints.responses_utils import construct_tool_dicts from vllm.entrypoints.tool import Tool from vllm.entrypoints.tool_server import ToolServer from vllm.outputs import RequestOutput +from vllm.reasoning.abs_reasoning_parsers import ReasoningParser +from vllm.transformers_utils.tokenizer import AnyTokenizer if TYPE_CHECKING: from mcp.client import ClientSession @@ -180,6 +191,71 @@ class SimpleContext(ConversationContext): raise NotImplementedError("Should not be called.") +class ParsableContext(ConversationContext): + def __init__( + self, + *, + response_messages: list[ResponseInputOutputItem], + tokenizer: AnyTokenizer, + reasoning_parser_cls: Callable[[AnyTokenizer], ReasoningParser] | None, + request: ResponsesRequest, + ): + self.num_prompt_tokens = 0 + self.num_output_tokens = 0 + self.num_cached_tokens = 0 + # TODO: num_reasoning_tokens is not implemented yet. 
+ self.num_reasoning_tokens = 0 + # not implemented yet for ParsableContext + self.all_turn_metrics: list[TurnMetrics] = [] + + if reasoning_parser_cls is None: + raise ValueError("reasoning_parser_cls must be provided.") + + self.parser = get_responses_parser_for_simple_context( + tokenizer=tokenizer, + reasoning_parser_cls=reasoning_parser_cls, + response_messages=response_messages, + request=request, + ) + + self._tool_sessions: dict[str, ClientSession | Tool] = {} + self.called_tools: set[str] = set() + + self.tool_dicts = construct_tool_dicts(request.tools, request.tool_choice) + + def append_output(self, output: RequestOutput) -> None: + self.num_prompt_tokens = len(output.prompt_token_ids or []) + self.num_cached_tokens = output.num_cached_tokens or 0 + self.num_output_tokens += len(output.outputs[0].token_ids or []) + self.parser.process(output.outputs[0]) + + def append_tool_output(self, output: list[ResponseInputOutputItem]) -> None: + raise NotImplementedError("Should not be called.") + + def need_builtin_tool_call(self) -> bool: + """Return true if the last message is a MCP tool call""" + return False + + async def call_tool(self) -> list[ResponseInputOutputItem]: + raise NotImplementedError("Should not be called.") + + def render_for_completion(self): + raise NotImplementedError("Should not be called.") + + async def init_tool_sessions( + self, + tool_server: ToolServer | None, + exit_stack: AsyncExitStack, + request_id: str, + mcp_tools: dict[str, Mcp], + ): + pass + + async def cleanup_session(self, *args, **kwargs) -> None: + """Can be used as coro to used in __aexit__""" + raise NotImplementedError("Should not be called.") + + class HarmonyContext(ConversationContext): def __init__( self, diff --git a/vllm/entrypoints/openai/parser/__init__.py b/vllm/entrypoints/openai/parser/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/entrypoints/openai/parser/responses_parser.py 
b/vllm/entrypoints/openai/parser/responses_parser.py new file mode 100644 index 0000000000000..1bc8e81bd9dfc --- /dev/null +++ b/vllm/entrypoints/openai/parser/responses_parser.py @@ -0,0 +1,101 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import logging +from collections.abc import Callable + +from openai.types.responses.response_output_message import ResponseOutputMessage +from openai.types.responses.response_output_text import ResponseOutputText +from openai.types.responses.response_reasoning_item import ( + Content, + ResponseReasoningItem, +) + +from vllm.entrypoints.openai.protocol import ResponseInputOutputItem, ResponsesRequest +from vllm.outputs import CompletionOutput +from vllm.reasoning.abs_reasoning_parsers import ReasoningParser +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils import random_uuid + +logger = logging.getLogger(__name__) + + +class ResponsesParser: + """Incremental parser over completion tokens with reasoning support.""" + + def __init__( + self, + *, + tokenizer: AnyTokenizer, + reasoning_parser_cls: Callable[[AnyTokenizer], ReasoningParser], + response_messages: list[ResponseInputOutputItem], + request: ResponsesRequest, + ): + self.response_messages: list[ResponseInputOutputItem] = ( + # TODO: initial messages may not be properly typed + response_messages + ) + self.num_init_messages = len(response_messages) + self.tokenizer = tokenizer + self.request = request + + self.reasoning_parser_instance = reasoning_parser_cls(tokenizer) + + def process(self, output: CompletionOutput) -> "ResponsesParser": + reasoning_content, content = self.reasoning_parser_instance.extract_reasoning( + output.text, request=self.request + ) + if reasoning_content: + self.response_messages.append( + ResponseReasoningItem( + type="reasoning", + id=f"rs_{random_uuid()}", + summary=[], + content=[ + Content( + type="reasoning_text", + text=reasoning_content, + ) + ], + 
) + ) + + if content: + self.response_messages.append( + ResponseOutputMessage( + type="message", + id=f"msg_{random_uuid()}", + status="completed", + role="assistant", + content=[ + ResponseOutputText( + annotations=[], # TODO + type="output_text", + text=content, + logprobs=None, # TODO + ) + ], + ) + ) + + return self + + +def get_responses_parser_for_simple_context( + *, + tokenizer: AnyTokenizer, + reasoning_parser_cls: Callable[[AnyTokenizer], ReasoningParser], + response_messages: list[ResponseInputOutputItem], + request: ResponsesRequest, +) -> ResponsesParser: + """Factory function to create a ResponsesParser with + optional reasoning parser. + + Returns: + ResponsesParser instance configured with the provided parser + """ + return ResponsesParser( + tokenizer=tokenizer, + reasoning_parser_cls=reasoning_parser_cls, + response_messages=response_messages, + request=request, + ) diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 81495a0777546..5ad86194ce1b2 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -60,6 +60,7 @@ from vllm.entrypoints.chat_utils import ( from vllm.entrypoints.context import ( ConversationContext, HarmonyContext, + ParsableContext, SimpleContext, StreamingHarmonyContext, ) @@ -96,8 +97,9 @@ from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.responses_utils import ( construct_input_messages, - convert_tool_responses_to_completions_format, + construct_tool_dicts, extract_tool_types, + make_response_output_items_from_parsable_context, ) from vllm.entrypoints.tool_server import ToolServer from vllm.inputs.data import TokensPrompt as EngineTokensPrompt @@ -228,7 +230,6 @@ class OpenAIServingResponses(OpenAIServing): self.tool_parser = self._get_tool_parser( tool_parser_name=tool_parser, 
enable_auto_tools=enable_auto_tools ) - self.exclude_tools_when_tool_choice_none = False # HACK(woosuk): This is a hack. We should use a better store. # FIXME: If enable_store=True, this may cause a memory leak since we # never remove responses from the store. @@ -413,7 +414,17 @@ class OpenAIServingResponses(OpenAIServing): else: context = HarmonyContext(messages, available_tools) else: - context = SimpleContext() + if envs.VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT: + # This is an feature in development for parsing + # tokens during generation instead of at the end + context = ParsableContext( + response_messages=messages, + tokenizer=tokenizer, + reasoning_parser_cls=self.reasoning_parser, + request=request, + ) + else: + context = SimpleContext() if self.reasoning_parser is not None: reasoning_parser = self.reasoning_parser(tokenizer) @@ -534,15 +545,7 @@ class OpenAIServingResponses(OpenAIServing): prev_response: ResponsesResponse | None, tokenizer: TokenizerLike, ): - if request.tools is None or ( - request.tool_choice == "none" and self.exclude_tools_when_tool_choice_none - ): - tool_dicts = None - else: - tool_dicts = [ - convert_tool_responses_to_completions_format(tool.model_dump()) - for tool in request.tools - ] + tool_dicts = construct_tool_dicts(request.tools, request.tool_choice) # Construct the input messages. 
messages = construct_input_messages( request_instructions=request.instructions, @@ -642,6 +645,22 @@ class OpenAIServingResponses(OpenAIServing): status = "cancelled" else: status = "incomplete" + elif isinstance(context, ParsableContext): + response_messages = context.parser.response_messages[ + context.parser.num_init_messages : + ] + output = make_response_output_items_from_parsable_context(response_messages) + + # TODO: context for non-gptoss models doesn't use messages + # so we can't get them out yet + if request.enable_response_messages: + raise NotImplementedError( + "enable_response_messages is currently only supported for gpt-oss" + ) + + # TODO: Calculate usage. + # assert final_res.prompt_token_ids is not None + num_tool_output_tokens = 0 else: assert isinstance(context, SimpleContext) final_res = context.last_output @@ -661,7 +680,7 @@ class OpenAIServingResponses(OpenAIServing): assert final_res.prompt_token_ids is not None num_tool_output_tokens = 0 - assert isinstance(context, (SimpleContext, HarmonyContext)) + assert isinstance(context, (SimpleContext, HarmonyContext, ParsableContext)) num_prompt_tokens = context.num_prompt_tokens num_generated_tokens = context.num_output_tokens num_cached_tokens = context.num_cached_tokens diff --git a/vllm/entrypoints/responses_utils.py b/vllm/entrypoints/responses_utils.py index 2e01cb038af85..5f21e2c44450c 100644 --- a/vllm/entrypoints/responses_utils.py +++ b/vllm/entrypoints/responses_utils.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any + from openai.types.chat import ( ChatCompletionAssistantMessageParam, ChatCompletionMessageToolCallParam, @@ -10,6 +12,7 @@ from openai.types.chat.chat_completion_message_tool_call_param import ( Function as FunctionCallTool, ) from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem +from openai.types.responses.response import ToolChoice from 
openai.types.responses.response_function_tool_call_output_item import ( ResponseFunctionToolCallOutputItem, ) @@ -24,6 +27,20 @@ from vllm.entrypoints.openai.protocol import ( ) +def make_response_output_items_from_parsable_context( + response_messages: list[ResponseInputOutputItem], +) -> list[ResponseOutputItem]: + """Given a list of sentences, construct ResponseOutput Items.""" + output_messages: list[ResponseOutputItem] = [] + for message in response_messages: + if not isinstance(message, ResponseFunctionToolCallOutputItem): + output_messages.append(message) + else: + raise NotImplementedError("tool calls not supported for response context") + + return output_messages + + def construct_input_messages( *, request_instructions: str | None = None, @@ -146,3 +163,16 @@ def convert_tool_responses_to_completions_format(tool: dict) -> dict: "type": "function", "function": tool, } + + +def construct_tool_dicts( + tools: list[Tool], tool_choice: ToolChoice +) -> list[dict[str, Any]] | None: + if tools is None or (tool_choice == "none"): + tool_dicts = None + else: + tool_dicts = [ + convert_tool_responses_to_completions_format(tool.model_dump()) + for tool in tools + ] + return tool_dicts diff --git a/vllm/envs.py b/vllm/envs.py index d0912863e6444..8b954fa14f28c 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -214,6 +214,7 @@ if TYPE_CHECKING: VLLM_ALLREDUCE_USE_SYMM_MEM: bool = True VLLM_TUNED_CONFIG_FOLDER: str | None = None VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS: set[str] = set() + VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT: bool = False VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS: bool = False VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY: bool = False VLLM_CUSTOM_SCOPES_FOR_PROFILING: bool = False @@ -1444,6 +1445,10 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_ALLREDUCE_USE_SYMM_MEM": lambda: bool( int(os.getenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "1")) ), + # Experimental: use this to enable MCP tool calling for non harmony models + 
"VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT": lambda: bool( + int(os.getenv("VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT", "0")) + ), # Allows vllm to find tuned config under customized folder "VLLM_TUNED_CONFIG_FOLDER": lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None), # Valid values are container,code_interpreter,web_search_preview From 63b1da76ba35cd8cb220c79c44556e07fa4fb0c6 Mon Sep 17 00:00:00 2001 From: Isotr0py <mozf@mail2.sysu.edu.cn> Date: Wed, 3 Dec 2025 01:33:23 +0800 Subject: [PATCH 28/45] [Chore]: Reorganize gguf utils funtions under `transformers_utils` (#29891) Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> --- tests/models/test_gguf_download.py | 2 +- tests/transformers_utils/test_utils.py | 12 +++-- vllm/config/model.py | 8 ++- vllm/engine/arg_utils.py | 3 +- vllm/tokenizers/registry.py | 6 +-- vllm/transformers_utils/config.py | 14 ++--- vllm/transformers_utils/gguf_utils.py | 71 +++++++++++++++++++++++++ vllm/transformers_utils/processor.py | 3 +- vllm/transformers_utils/utils.py | 72 -------------------------- 9 files changed, 96 insertions(+), 95 deletions(-) diff --git a/tests/models/test_gguf_download.py b/tests/models/test_gguf_download.py index 155768ac9bff7..b1674cdf77178 100644 --- a/tests/models/test_gguf_download.py +++ b/tests/models/test_gguf_download.py @@ -203,7 +203,7 @@ class TestGGUFModelLoader: @patch("vllm.config.model.get_hf_image_processor_config", return_value=None) @patch("vllm.config.model.get_config") @patch("vllm.config.model.is_gguf", return_value=False) - @patch("vllm.transformers_utils.utils.check_gguf_file", return_value=False) + @patch("vllm.transformers_utils.gguf_utils.check_gguf_file", return_value=False) @patch("os.path.isfile", return_value=False) def test_prepare_weights_invalid_format( self, diff --git a/tests/transformers_utils/test_utils.py b/tests/transformers_utils/test_utils.py index a8d0b9be9ec29..0a6a65b4133c9 100644 --- a/tests/transformers_utils/test_utils.py +++ b/tests/transformers_utils/test_utils.py @@ 
-5,13 +5,15 @@ from unittest.mock import patch import pytest +from vllm.transformers_utils.gguf_utils import ( + is_gguf, + is_remote_gguf, + split_remote_gguf, +) from vllm.transformers_utils.utils import ( is_cloud_storage, is_gcs, - is_gguf, - is_remote_gguf, is_s3, - split_remote_gguf, ) @@ -132,7 +134,7 @@ class TestSplitRemoteGGUF: class TestIsGGUF: """Test is_gguf utility function.""" - @patch("vllm.transformers_utils.utils.check_gguf_file", return_value=True) + @patch("vllm.transformers_utils.gguf_utils.check_gguf_file", return_value=True) def test_is_gguf_with_local_file(self, mock_check_gguf): """Test is_gguf with local GGUF file.""" assert is_gguf("/path/to/model.gguf") @@ -149,7 +151,7 @@ class TestIsGGUF: assert not is_gguf("repo/model:quant") assert not is_gguf("repo/model:INVALID") - @patch("vllm.transformers_utils.utils.check_gguf_file", return_value=False) + @patch("vllm.transformers_utils.gguf_utils.check_gguf_file", return_value=False) def test_is_gguf_false(self, mock_check_gguf): """Test is_gguf returns False for non-GGUF models.""" assert not is_gguf("unsloth/Qwen3-0.6B") diff --git a/vllm/config/model.py b/vllm/config/model.py index ef592ac001535..5de97697698a1 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -37,15 +37,13 @@ from vllm.transformers_utils.config import ( uses_xdrope_dim, ) from vllm.transformers_utils.gguf_utils import ( - maybe_patch_hf_config_from_gguf, -) -from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri -from vllm.transformers_utils.utils import ( is_gguf, is_remote_gguf, - maybe_model_redirect, + maybe_patch_hf_config_from_gguf, split_remote_gguf, ) +from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri +from vllm.transformers_utils.utils import maybe_model_redirect from vllm.utils.import_utils import LazyLoader from vllm.utils.torch_utils import common_broadcastable_dtype diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py 
index 5a2836668174f..83029e09ceaad 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -86,8 +86,9 @@ from vllm.transformers_utils.config import ( is_interleaved, maybe_override_with_speculators, ) +from vllm.transformers_utils.gguf_utils import is_gguf from vllm.transformers_utils.repo_utils import get_model_path -from vllm.transformers_utils.utils import is_cloud_storage, is_gguf +from vllm.transformers_utils.utils import is_cloud_storage from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.mem_constants import GiB_bytes from vllm.utils.network_utils import get_ip diff --git a/vllm/tokenizers/registry.py b/vllm/tokenizers/registry.py index bf9d295de23ae..87048f2ec7845 100644 --- a/vllm/tokenizers/registry.py +++ b/vllm/tokenizers/registry.py @@ -11,14 +11,14 @@ from typing_extensions import assert_never import vllm.envs as envs from vllm.logger import init_logger -from vllm.transformers_utils.gguf_utils import get_gguf_file_path_from_hf -from vllm.transformers_utils.repo_utils import list_filtered_repo_files -from vllm.transformers_utils.utils import ( +from vllm.transformers_utils.gguf_utils import ( check_gguf_file, + get_gguf_file_path_from_hf, is_gguf, is_remote_gguf, split_remote_gguf, ) +from vllm.transformers_utils.repo_utils import list_filtered_repo_files from vllm.utils.import_utils import resolve_obj_by_qualname from .protocol import TokenizerLike diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 1bb5791e19016..0cceab90ba9a2 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -26,8 +26,15 @@ from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME from vllm import envs from vllm.logger import init_logger +from vllm.transformers_utils.utils import parse_safetensors_file_metadata from .config_parser_base import ConfigParserBase +from .gguf_utils import ( + check_gguf_file, + is_gguf, + is_remote_gguf, + split_remote_gguf, +) from 
.repo_utils import ( _get_hf_token, file_or_path_exists, @@ -36,13 +43,6 @@ from .repo_utils import ( try_get_local_file, with_retry, ) -from .utils import ( - check_gguf_file, - is_gguf, - is_remote_gguf, - parse_safetensors_file_metadata, - split_remote_gguf, -) if envs.VLLM_USE_MODELSCOPE: from modelscope import AutoConfig diff --git a/vllm/transformers_utils/gguf_utils.py b/vllm/transformers_utils/gguf_utils.py index cb1fc2d092e01..f3fd43c6ace51 100644 --- a/vllm/transformers_utils/gguf_utils.py +++ b/vllm/transformers_utils/gguf_utils.py @@ -2,10 +2,14 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """GGUF utility functions.""" +from functools import cache +from os import PathLike from pathlib import Path import gguf +import regex as re from gguf.constants import Keys, VisionProjectorType +from gguf.quants import GGMLQuantizationType from transformers import Gemma3Config, PretrainedConfig, SiglipVisionConfig from vllm.logger import init_logger @@ -15,6 +19,73 @@ from .repo_utils import list_filtered_repo_files logger = init_logger(__name__) +@cache +def check_gguf_file(model: str | PathLike) -> bool: + """Check if the file is a GGUF model.""" + model = Path(model) + if not model.is_file(): + return False + elif model.suffix == ".gguf": + return True + + try: + with model.open("rb") as f: + header = f.read(4) + + return header == b"GGUF" + except Exception as e: + logger.debug("Error reading file %s: %s", model, e) + return False + + +@cache +def is_remote_gguf(model: str | Path) -> bool: + """Check if the model is a remote GGUF model.""" + pattern = r"^[a-zA-Z0-9][a-zA-Z0-9._-]*/[a-zA-Z0-9][a-zA-Z0-9._-]*:[A-Za-z0-9_+-]+$" + model = str(model) + if re.fullmatch(pattern, model): + _, quant_type = model.rsplit(":", 1) + return is_valid_gguf_quant_type(quant_type) + return False + + +def is_valid_gguf_quant_type(gguf_quant_type: str) -> bool: + """Check if the quant type is a valid GGUF quant type.""" + return 
getattr(GGMLQuantizationType, gguf_quant_type, None) is not None + + +def split_remote_gguf(model: str | Path) -> tuple[str, str]: + """Split the model into repo_id and quant type.""" + model = str(model) + if is_remote_gguf(model): + parts = model.rsplit(":", 1) + return (parts[0], parts[1]) + raise ValueError( + f"Wrong GGUF model or invalid GGUF quant type: {model}.\n" + "- It should be in repo_id:quant_type format.\n" + f"- Valid GGMLQuantizationType values: {GGMLQuantizationType._member_names_}", + ) + + +def is_gguf(model: str | Path) -> bool: + """Check if the model is a GGUF model. + + Args: + model: Model name, path, or Path object to check. + + Returns: + True if the model is a GGUF model, False otherwise. + """ + model = str(model) + + # Check if it's a local GGUF file + if check_gguf_file(model): + return True + + # Check if it's a remote GGUF model (repo_id:quant_type format) + return is_remote_gguf(model) + + def detect_gguf_multimodal(model: str) -> Path | None: """Check if GGUF model has multimodal projector file. 
diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index 63cdf63370342..e9864b0c1531d 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -18,7 +18,8 @@ from transformers.processing_utils import ProcessorMixin from transformers.video_processing_utils import BaseVideoProcessor from typing_extensions import TypeVar -from vllm.transformers_utils.utils import convert_model_repo_to_path, is_gguf +from vllm.transformers_utils.gguf_utils import is_gguf +from vllm.transformers_utils.utils import convert_model_repo_to_path from vllm.utils.func_utils import get_allowed_kwarg_only_overrides if TYPE_CHECKING: diff --git a/vllm/transformers_utils/utils.py b/vllm/transformers_utils/utils.py index 45a873c9f7001..96f292f4c949e 100644 --- a/vllm/transformers_utils/utils.py +++ b/vllm/transformers_utils/utils.py @@ -9,8 +9,6 @@ from os import PathLike from pathlib import Path from typing import Any -from gguf import GGMLQuantizationType - import vllm.envs as envs from vllm.logger import init_logger @@ -29,76 +27,6 @@ def is_cloud_storage(model_or_path: str) -> bool: return is_s3(model_or_path) or is_gcs(model_or_path) -@cache -def check_gguf_file(model: str | PathLike) -> bool: - """Check if the file is a GGUF model.""" - model = Path(model) - if not model.is_file(): - return False - elif model.suffix == ".gguf": - return True - - try: - with model.open("rb") as f: - header = f.read(4) - - return header == b"GGUF" - except Exception as e: - logger.debug("Error reading file %s: %s", model, e) - return False - - -@cache -def is_remote_gguf(model: str | Path) -> bool: - """Check if the model is a remote GGUF model.""" - model = str(model) - return ( - (not is_cloud_storage(model)) - and (not model.startswith(("http://", "https://"))) - and ("/" in model and ":" in model) - and is_valid_gguf_quant_type(model.rsplit(":", 1)[1]) - ) - - -def is_valid_gguf_quant_type(gguf_quant_type: str) -> bool: - """Check 
if the quant type is a valid GGUF quant type.""" - return getattr(GGMLQuantizationType, gguf_quant_type, None) is not None - - -def split_remote_gguf(model: str | Path) -> tuple[str, str]: - """Split the model into repo_id and quant type.""" - model = str(model) - if is_remote_gguf(model): - parts = model.rsplit(":", 1) - return (parts[0], parts[1]) - raise ValueError( - "Wrong GGUF model or invalid GGUF quant type: %s.\n" - "- It should be in repo_id:quant_type format.\n" - "- Valid GGMLQuantizationType values: %s", - model, - GGMLQuantizationType._member_names_, - ) - - -def is_gguf(model: str | Path) -> bool: - """Check if the model is a GGUF model. - - Args: - model: Model name, path, or Path object to check. - - Returns: - True if the model is a GGUF model, False otherwise. - """ - model = str(model) - - # Check if it's a local GGUF file - if check_gguf_file(model): - return True - - # Check if it's a remote GGUF model (repo_id:quant_type format) - return is_remote_gguf(model) - - def modelscope_list_repo_files( repo_id: str, revision: str | None = None, From c77b9929a04c56d369c9f6b86fbf5d4891bab285 Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Tue, 2 Dec 2025 11:52:54 -0600 Subject: [PATCH 29/45] Update AMD-CI testing mirror (as of 2025-12-02) (#29898) Signed-off-by: Alexei V. 
Ivanov <alexei.ivanov@amd.com> --- .buildkite/test-amd.yaml | 43 ++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index d5d4043a1d5bc..67088caa8150b 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -51,7 +51,7 @@ steps: - label: Async Engine, Inputs, Utils, Worker Test # 10min timeout_in_minutes: 15 mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 + agent_pool: mi355_1 grade: Blocking source_file_dependencies: - vllm/ @@ -64,7 +64,7 @@ steps: - label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min timeout_in_minutes: 20 mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 + agent_pool: mi355_1 grade: Blocking source_file_dependencies: - vllm/ @@ -99,7 +99,7 @@ steps: - label: Basic Correctness Test # 20min timeout_in_minutes: 30 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 + agent_pool: mi355_1 # grade: Blocking fast_check: true torch_nightly: true @@ -116,7 +116,7 @@ steps: - label: Entrypoints Unit Tests # 5min mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 + agent_pool: mi355_1 grade: Blocking timeout_in_minutes: 10 working_dir: "/vllm-workspace/tests" @@ -131,7 +131,7 @@ steps: - label: Entrypoints Integration Test (LLM) # 30min timeout_in_minutes: 40 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 + agent_pool: mi355_1 # grade: Blocking working_dir: "/vllm-workspace/tests" fast_check: true @@ -254,7 +254,7 @@ steps: - label: EPLB Algorithm Test # 5min mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 + agent_pool: mi355_1 grade: Blocking timeout_in_minutes: 15 working_dir: "/vllm-workspace/tests" @@ -266,7 +266,7 @@ steps: - label: EPLB Execution Test # 10min mirror_hardwares: [amdexperimental, amdproduction] - 
agent_pool: mi325_4 + agent_pool: mi355_4 # grade: Blocking timeout_in_minutes: 20 working_dir: "/vllm-workspace/tests" @@ -281,7 +281,7 @@ steps: - label: Metrics, Tracing Test # 12min timeout_in_minutes: 20 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_2 + agent_pool: mi355_2 # grade: Blocking num_gpus: 2 source_file_dependencies: @@ -301,7 +301,7 @@ steps: - label: Regression Test # 7min timeout_in_minutes: 20 mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 + agent_pool: mi355_1 grade: Blocking source_file_dependencies: - vllm/ @@ -343,7 +343,7 @@ steps: - label: V1 Test entrypoints # 35min timeout_in_minutes: 50 mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi325_1 + agent_pool: mi355_1 grade: Blocking source_file_dependencies: - vllm/ @@ -544,7 +544,7 @@ steps: - label: PyTorch Fullgraph Test # 27min timeout_in_minutes: 40 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 + agent_pool: mi355_1 # grade: Blocking torch_nightly: true source_file_dependencies: @@ -715,6 +715,7 @@ steps: # we can only upgrade after this is resolved # TODO(jerryzh168): resolve the above comment - uv pip install --system torchao==0.13.0 + - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - label: LM Eval Small Models # 15min @@ -934,6 +935,18 @@ steps: commands: - pytest -v -s models/language/pooling_mteb_test +- label: Multi-Modal Processor Test (CPU) + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + no_gpu: true + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py + - label: Multi-Modal Processor Test # 44min timeout_in_minutes: 60 
mirror_hardwares: [amdexperimental] @@ -1472,14 +1485,14 @@ steps: working_dir: "/vllm-workspace/" num_gpus: 2 commands: - - pytest -v -s tests/compile/distributed/test_async_tp.py + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm - - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - - pytest -v -s tests/distributed/test_sequence_parallel.py + - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + - HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### From 2d613de9aef3ef25e9adc8887ac0388da092b500 Mon Sep 17 00:00:00 2001 From: Benjamin Bartels <benjamin@bartels.dev> Date: Tue, 2 Dec 2025 18:21:49 +0000 Subject: [PATCH 30/45] [CI/Build] Fixes missing runtime dependencies (#29822) Signed-off-by: bbartels <benjamin@bartels.dev> --- docker/Dockerfile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index eb7c105071c00..006481b23cb9f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -364,7 +364,12 @@ RUN 
CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \ cuda-cudart-${CUDA_VERSION_DASH} \ cuda-nvrtc-${CUDA_VERSION_DASH} \ cuda-cuobjdump-${CUDA_VERSION_DASH} \ - libcublas-${CUDA_VERSION_DASH} && \ + # https://github.com/vllm-project/vllm/issues/29590 + libcurand-dev-${CUDA_VERSION_DASH} \ + libcublas-${CUDA_VERSION_DASH} \ + # Fixes nccl_allocator requiring nccl.h at runtime + # https://github.com/vllm-project/vllm/blob/1336a1ea244fa8bfd7e72751cabbdb5b68a0c11a/vllm/distributed/device_communicators/pynccl_allocator.py#L22 + libnccl-dev && \ rm -rf /var/lib/apt/lists/* ARG PIP_INDEX_URL UV_INDEX_URL From 1d93f116754f6e81acb9287ebcca0d1d1170a944 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni <mbonanni@redhat.com> Date: Tue, 2 Dec 2025 13:48:08 -0500 Subject: [PATCH 31/45] [Attention][CUDAGraph] Remove CG padding from attention backends (#29352) Signed-off-by: Matthew Bonanni <mbonanni@redhat.com> --- .../layers/mamba/mamba_mixer.py | 17 +++++++------- vllm/v1/attention/backends/gdn_attn.py | 22 +++++-------------- vllm/v1/attention/backends/mamba1_attn.py | 12 +++------- vllm/v1/attention/backends/mamba2_attn.py | 12 +++------- vllm/v1/attention/backends/short_conv_attn.py | 3 +-- 5 files changed, 20 insertions(+), 46 deletions(-) diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index 90e520e244416..0b63acf2dc5a5 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -252,7 +252,6 @@ class MambaMixer(MambaBase, CustomOp): conv_state = self_kv_cache[0].transpose(-1, -2) ssm_state = self_kv_cache[1] has_initial_states_p = attn_metadata.has_initial_states_p - num_padded_decodes = attn_metadata.num_padded_decodes # 1. 
Gated MLP's linear projection projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1) @@ -281,7 +280,7 @@ class MambaMixer(MambaBase, CustomOp): state_indices_tensor, num_prefill_tokens, num_prefills, - num_padded_decodes, + num_decode_tokens, ) hidden_states_BC_p = prefill_decode_split.hidden_states_BC_p hidden_states_BC_d = prefill_decode_split.hidden_states_BC_d @@ -470,24 +469,24 @@ def split_batch_to_prefill_and_decode( state_indices_tensor: torch.Tensor, num_prefill_tokens: int, num_prefills: int, - num_padded_decodes: int, + num_decode_tokens: int, ) -> PrefillDecodeSplit: - num_actual_tokens = num_prefill_tokens + num_padded_decodes + num_actual_tokens = num_prefill_tokens + num_decode_tokens # In v1, decode tokens come first, then prefill tokens. hidden_states_BC_d, hidden_states_BC_p = torch.split( hidden_states_BC[..., :num_actual_tokens], - [num_padded_decodes, num_prefill_tokens], + [num_decode_tokens, num_prefill_tokens], dim=-1, ) gate_d, gate_p = torch.split( - gate[..., :num_actual_tokens], [num_padded_decodes, num_prefill_tokens], dim=-1 + gate[..., :num_actual_tokens], [num_decode_tokens, num_prefill_tokens], dim=-1 ) - # num_padded_decodes accounts for CUDA graph padding when applicable + # num_decode_tokens accounts for CUDA graph padding when applicable state_indices_tensor_d, state_indices_tensor_p = torch.split( - state_indices_tensor[: num_padded_decodes + num_prefills], - [num_padded_decodes, num_prefills], + state_indices_tensor[: num_decode_tokens + num_prefills], + [num_decode_tokens, num_prefills], dim=0, ) diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py index 69b5a6fb48564..e921f8c3de073 100644 --- a/vllm/v1/attention/backends/gdn_attn.py +++ b/vllm/v1/attention/backends/gdn_attn.py @@ -254,17 +254,11 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata] ) else: has_initial_state = None - num_actual_tokens = ( - num_prefill_tokens + 
num_decode_tokens + num_spec_decode_tokens - ) - # prepare tensors for cudagraph - # - # With speculative decoding, the xgrammar backend may rollback tokens - # and causing some sequences has less draft tokens than self.num_spec. - # - # In above cases, the max possible batch size for n tokens, can be - # min(n, cudagraph_max_bs). + # Prepare tensors for cudagraph + # Note: m.num_actual_tokens is already padded by the model runner for CUDAGraph + batch_size = m.num_actual_tokens + if ( self.use_full_cuda_graph and num_prefills == 0 @@ -272,9 +266,6 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata] and num_spec_decodes <= self.decode_cudagraph_max_bs and num_spec_decode_tokens <= self.decode_cudagraph_max_bs ): - num_actual_tokens = self.vllm_config.pad_for_cudagraph(m.num_actual_tokens) - batch_size = min(self.decode_cudagraph_max_bs, num_actual_tokens) - self.spec_state_indices_tensor[:num_spec_decodes].copy_( spec_state_indices_tensor, non_blocking=True ) @@ -319,9 +310,6 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata] and num_spec_decodes == 0 and num_decodes <= self.decode_cudagraph_max_bs ): - num_actual_tokens = self.vllm_config.pad_for_cudagraph(m.num_actual_tokens) - batch_size = num_actual_tokens - self.non_spec_state_indices_tensor[:num_decodes].copy_( non_spec_state_indices_tensor, non_blocking=True ) @@ -344,7 +332,7 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata] num_decode_tokens=num_decode_tokens, num_spec_decodes=num_spec_decodes, num_spec_decode_tokens=num_spec_decode_tokens, - num_actual_tokens=num_actual_tokens, + num_actual_tokens=m.num_actual_tokens, has_initial_state=has_initial_state, spec_query_start_loc=spec_query_start_loc, non_spec_query_start_loc=non_spec_query_start_loc, diff --git a/vllm/v1/attention/backends/mamba1_attn.py b/vllm/v1/attention/backends/mamba1_attn.py index 8e949e53330c1..fcda6134016ba 100644 --- 
a/vllm/v1/attention/backends/mamba1_attn.py +++ b/vllm/v1/attention/backends/mamba1_attn.py @@ -31,7 +31,6 @@ class Mamba1AttentionMetadata: num_prefill_tokens: int num_decodes: int num_decode_tokens: int - num_padded_decodes: int block_idx_last_scheduled_token: torch.Tensor # shape: [batch,] block_idx_first_scheduled_token_p: torch.Tensor # shape: [batch,] @@ -68,7 +67,6 @@ class Mamba1AttentionMetadataBuilder( has_initial_states_p = None query_start_loc_p = None - padded_decodes = num_decodes num_computed_tokens, num_computed_tokens_p = None, None block_idx_first_scheduled_token = None block_idx_first_scheduled_token_p = None @@ -125,11 +123,10 @@ class Mamba1AttentionMetadataBuilder( and num_decodes <= self.decode_cudagraph_max_bs and self.compilation_config.cudagraph_mode.has_full_cudagraphs() ): - padded_decodes = self.vllm_config.pad_for_cudagraph(num_decodes) self.state_indices_tensor[:num_decodes].copy_( state_indices_tensor, non_blocking=True ) - state_indices_tensor = self.state_indices_tensor[:padded_decodes] + state_indices_tensor = self.state_indices_tensor[:num_decode_tokens] state_indices_tensor[num_decodes:] = PAD_SLOT_ID if self.vllm_config.cache_config.enable_prefix_caching: @@ -137,17 +134,15 @@ class Mamba1AttentionMetadataBuilder( block_idx_last_scheduled_token, non_blocking=True ) block_idx_last_scheduled_token = self.block_idx_last_scheduled_token[ - :padded_decodes + :num_decode_tokens ] - block_idx_last_scheduled_token[num_decodes:] = 0 self.block_idx_last_computed_token[:num_decodes].copy_( block_idx_last_computed_token, non_blocking=True ) block_idx_last_computed_token = self.block_idx_last_computed_token[ - :padded_decodes + :num_decode_tokens ] - block_idx_last_computed_token[num_decodes:] = 0 return Mamba1AttentionMetadata( query_start_loc_p=query_start_loc_p, @@ -157,7 +152,6 @@ class Mamba1AttentionMetadataBuilder( num_prefill_tokens=num_prefill_tokens, num_decodes=num_decodes, num_decode_tokens=num_decode_tokens, - 
num_padded_decodes=padded_decodes, block_idx_last_scheduled_token=block_idx_last_scheduled_token, block_idx_first_scheduled_token_p=block_idx_first_scheduled_token_p, block_idx_last_computed_token=block_idx_last_computed_token, diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py index 888734e5d2b6b..bf1d8f09ab0ac 100644 --- a/vllm/v1/attention/backends/mamba2_attn.py +++ b/vllm/v1/attention/backends/mamba2_attn.py @@ -10,7 +10,6 @@ from vllm.config import VllmConfig from vllm.utils.math_utils import cdiv from vllm.v1.attention.backends.mamba_attn import BaseMambaAttentionMetadataBuilder from vllm.v1.attention.backends.utils import ( - PAD_SLOT_ID, CommonAttentionMetadata, compute_causal_conv1d_metadata, split_decodes_and_prefills, @@ -304,30 +303,25 @@ class Mamba2AttentionMetadataBuilder( num_decodes <= self.decode_cudagraph_max_bs and self.compilation_config.cudagraph_mode.has_full_cudagraphs() ): - # Pad state tensor for CUDA graph - num_input_tokens = self.vllm_config.pad_for_cudagraph(num_decodes) self.state_indices_tensor[:num_decodes].copy_( state_indices_tensor, non_blocking=True ) - state_indices_tensor = self.state_indices_tensor[:num_input_tokens] - state_indices_tensor[num_decodes:] = PAD_SLOT_ID + state_indices_tensor = self.state_indices_tensor[:num_decode_tokens] if self.vllm_config.cache_config.enable_prefix_caching: self.block_idx_last_scheduled_token[:num_decodes].copy_( block_idx_last_scheduled_token, non_blocking=True ) block_idx_last_scheduled_token = self.block_idx_last_scheduled_token[ - :num_input_tokens + :num_decode_tokens ] - block_idx_last_scheduled_token[num_decodes:] = 0 self.block_idx_last_computed_token[:num_decodes].copy_( block_idx_last_computed_token, non_blocking=True ) block_idx_last_computed_token = self.block_idx_last_computed_token[ - :num_input_tokens + :num_decode_tokens ] - block_idx_last_computed_token[num_decodes:] = 0 attn_metadata = Mamba2AttentionMetadata( 
num_prefills=num_prefills, diff --git a/vllm/v1/attention/backends/short_conv_attn.py b/vllm/v1/attention/backends/short_conv_attn.py index de0cb73db0917..c8fe0faf71088 100644 --- a/vllm/v1/attention/backends/short_conv_attn.py +++ b/vllm/v1/attention/backends/short_conv_attn.py @@ -83,11 +83,10 @@ class ShortConvAttentionMetadataBuilder( and num_decodes <= self.decode_cudagraph_max_bs and self.compilation_config.cudagraph_mode.has_full_cudagraphs() ): - num_input_tokens = self.vllm_config.pad_for_cudagraph(num_decodes) self.state_indices_tensor[:num_decodes].copy_( state_indices_tensor, non_blocking=True ) - state_indices_tensor = self.state_indices_tensor[:num_input_tokens] + state_indices_tensor = self.state_indices_tensor[:num_decode_tokens] state_indices_tensor[num_decodes:] = PAD_SLOT_ID attn_metadata = ShortConvAttentionMetadata( From a2b053dc858db461d8d98cff37ee7c67ba21126b Mon Sep 17 00:00:00 2001 From: Navanit Dubey <98005188+Navanit-git@users.noreply.github.com> Date: Wed, 3 Dec 2025 00:58:35 +0530 Subject: [PATCH 32/45] feat(model): Add BitsAndBytes quantization support for Qwen3-Omni-MoE (#29896) Signed-off-by: navanit-git <navanitdubey@gmail.com> --- .../models/qwen3_omni_moe_thinker.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index 39dd42552ae8f..fe825198dcaa4 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -62,6 +62,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.qwen2_audio import 
Qwen2AudioProcessingInfo from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItems @@ -1137,6 +1138,18 @@ class Qwen3OmniMoeThinkerForConditionalGeneration( } ) + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + @classmethod def get_placeholder_str(cls, modality: str, i: int) -> str | None: if modality.startswith("image"): @@ -1763,3 +1776,13 @@ class Qwen3OmniMoeThinkerForConditionalGeneration( mrope_position_delta = llm_positions.max() + 1 - seq_len return llm_positions, mrope_position_delta + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="visual.merger", + tower_model=["visual.", "audio_tower."], + ) From 1c593e117d3a818815b5d07992a096d53b519a15 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Tue, 2 Dec 2025 20:40:56 +0000 Subject: [PATCH 33/45] Fix boolean nested params, add dict format support, and enhance plotting for vllm bench sweep (#29025) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luka Govedič <luka.govedic@gmail.com> Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com> Co-authored-by: Luka Govedič <luka.govedic@gmail.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> --- tests/benchmarks/test_param_sweep.py | 257 ++++++++++++++++++++++++++ tests/benchmarks/test_plot_filters.py | 171 +++++++++++++++++ vllm/benchmarks/sweep/param_sweep.py | 85 ++++++++- vllm/benchmarks/sweep/plot.py | 109 ++++++++++- vllm/benchmarks/sweep/serve.py | 14 +- 5 files 
changed, 614 insertions(+), 22 deletions(-) create mode 100644 tests/benchmarks/test_param_sweep.py create mode 100644 tests/benchmarks/test_plot_filters.py diff --git a/tests/benchmarks/test_param_sweep.py b/tests/benchmarks/test_param_sweep.py new file mode 100644 index 0000000000000..0d47cfd9d6230 --- /dev/null +++ b/tests/benchmarks/test_param_sweep.py @@ -0,0 +1,257 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import json +import tempfile +from pathlib import Path + +import pytest + +from vllm.benchmarks.sweep.param_sweep import ParameterSweep, ParameterSweepItem + + +class TestParameterSweepItem: + """Test ParameterSweepItem functionality.""" + + @pytest.mark.parametrize( + "input_dict,expected", + [ + ( + {"compilation_config.use_inductor_graph_partition": False}, + "--compilation-config.use_inductor_graph_partition=false", + ), + ( + {"compilation_config.use_inductor_graph_partition": True}, + "--compilation-config.use_inductor_graph_partition=true", + ), + ( + {"compilation_config.use_inductor": False}, + "--compilation-config.use_inductor=false", + ), + ( + {"compilation_config.use_inductor": True}, + "--compilation-config.use_inductor=true", + ), + ], + ) + def test_nested_boolean_params(self, input_dict, expected): + """Test that nested boolean params use =true/false syntax.""" + item = ParameterSweepItem.from_record(input_dict) + cmd = item.apply_to_cmd(["vllm", "serve", "model"]) + assert expected in cmd + + @pytest.mark.parametrize( + "input_dict,expected", + [ + ({"enable_prefix_caching": False}, "--no-enable-prefix-caching"), + ({"enable_prefix_caching": True}, "--enable-prefix-caching"), + ({"disable_log_stats": False}, "--no-disable-log-stats"), + ({"disable_log_stats": True}, "--disable-log-stats"), + ], + ) + def test_non_nested_boolean_params(self, input_dict, expected): + """Test that non-nested boolean params use --no- prefix.""" + item = 
ParameterSweepItem.from_record(input_dict) + cmd = item.apply_to_cmd(["vllm", "serve", "model"]) + assert expected in cmd + + @pytest.mark.parametrize( + "compilation_config", + [ + {"cudagraph_mode": "full", "mode": 2, "use_inductor_graph_partition": True}, + { + "cudagraph_mode": "piecewise", + "mode": 3, + "use_inductor_graph_partition": False, + }, + ], + ) + def test_nested_dict_value(self, compilation_config): + """Test that nested dict values are serialized as JSON.""" + item = ParameterSweepItem.from_record( + {"compilation_config": compilation_config} + ) + cmd = item.apply_to_cmd(["vllm", "serve", "model"]) + assert "--compilation-config" in cmd + # The dict should be JSON serialized + idx = cmd.index("--compilation-config") + assert json.loads(cmd[idx + 1]) == compilation_config + + @pytest.mark.parametrize( + "input_dict,expected_key,expected_value", + [ + ({"model": "test-model"}, "--model", "test-model"), + ({"max_tokens": 100}, "--max-tokens", "100"), + ({"temperature": 0.7}, "--temperature", "0.7"), + ], + ) + def test_string_and_numeric_values(self, input_dict, expected_key, expected_value): + """Test that string and numeric values are handled correctly.""" + item = ParameterSweepItem.from_record(input_dict) + cmd = item.apply_to_cmd(["vllm", "serve"]) + assert expected_key in cmd + assert expected_value in cmd + + @pytest.mark.parametrize( + "input_dict,expected_key,key_idx_offset", + [ + ({"max_tokens": 200}, "--max-tokens", 1), + ({"enable_prefix_caching": False}, "--no-enable-prefix-caching", 0), + ], + ) + def test_replace_existing_parameter(self, input_dict, expected_key, key_idx_offset): + """Test that existing parameters in cmd are replaced.""" + item = ParameterSweepItem.from_record(input_dict) + + if key_idx_offset == 1: + # Key-value pair + cmd = item.apply_to_cmd(["vllm", "serve", "--max-tokens", "100", "model"]) + assert expected_key in cmd + idx = cmd.index(expected_key) + assert cmd[idx + 1] == "200" + assert "100" not in cmd + else: 
+ # Boolean flag + cmd = item.apply_to_cmd( + ["vllm", "serve", "--enable-prefix-caching", "model"] + ) + assert expected_key in cmd + assert "--enable-prefix-caching" not in cmd + + +class TestParameterSweep: + """Test ParameterSweep functionality.""" + + def test_from_records_list(self): + """Test creating ParameterSweep from a list of records.""" + records = [ + {"max_tokens": 100, "temperature": 0.7}, + {"max_tokens": 200, "temperature": 0.9}, + ] + sweep = ParameterSweep.from_records(records) + assert len(sweep) == 2 + assert sweep[0]["max_tokens"] == 100 + assert sweep[1]["max_tokens"] == 200 + + def test_read_from_dict(self): + """Test creating ParameterSweep from a dict format.""" + data = { + "experiment1": {"max_tokens": 100, "temperature": 0.7}, + "experiment2": {"max_tokens": 200, "temperature": 0.9}, + } + sweep = ParameterSweep.read_from_dict(data) + assert len(sweep) == 2 + + # Check that items have the _benchmark_name field + names = {item["_benchmark_name"] for item in sweep} + assert names == {"experiment1", "experiment2"} + + # Check that parameters are preserved + for item in sweep: + if item["_benchmark_name"] == "experiment1": + assert item["max_tokens"] == 100 + assert item["temperature"] == 0.7 + elif item["_benchmark_name"] == "experiment2": + assert item["max_tokens"] == 200 + assert item["temperature"] == 0.9 + + def test_read_json_list_format(self): + """Test reading JSON file with list format.""" + records = [ + {"max_tokens": 100, "temperature": 0.7}, + {"max_tokens": 200, "temperature": 0.9}, + ] + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(records, f) + temp_path = Path(f.name) + + try: + sweep = ParameterSweep.read_json(temp_path) + assert len(sweep) == 2 + assert sweep[0]["max_tokens"] == 100 + assert sweep[1]["max_tokens"] == 200 + finally: + temp_path.unlink() + + def test_read_json_dict_format(self): + """Test reading JSON file with dict format.""" + data = { + "experiment1": 
{"max_tokens": 100, "temperature": 0.7}, + "experiment2": {"max_tokens": 200, "temperature": 0.9}, + } + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(data, f) + temp_path = Path(f.name) + + try: + sweep = ParameterSweep.read_json(temp_path) + assert len(sweep) == 2 + + # Check that items have the _benchmark_name field + names = {item["_benchmark_name"] for item in sweep} + assert names == {"experiment1", "experiment2"} + finally: + temp_path.unlink() + + def test_unique_benchmark_names_validation(self): + """Test that duplicate _benchmark_name values raise an error.""" + # Test with duplicate names in list format + records = [ + {"_benchmark_name": "exp1", "max_tokens": 100}, + {"_benchmark_name": "exp1", "max_tokens": 200}, + ] + + with pytest.raises(ValueError, match="Duplicate _benchmark_name values"): + ParameterSweep.from_records(records) + + def test_unique_benchmark_names_multiple_duplicates(self): + """Test validation with multiple duplicate names.""" + records = [ + {"_benchmark_name": "exp1", "max_tokens": 100}, + {"_benchmark_name": "exp1", "max_tokens": 200}, + {"_benchmark_name": "exp2", "max_tokens": 300}, + {"_benchmark_name": "exp2", "max_tokens": 400}, + ] + + with pytest.raises(ValueError, match="Duplicate _benchmark_name values"): + ParameterSweep.from_records(records) + + def test_no_benchmark_names_allowed(self): + """Test that records without _benchmark_name are allowed.""" + records = [ + {"max_tokens": 100, "temperature": 0.7}, + {"max_tokens": 200, "temperature": 0.9}, + ] + sweep = ParameterSweep.from_records(records) + assert len(sweep) == 2 + + def test_mixed_benchmark_names_allowed(self): + """Test that mixing records with and without _benchmark_name is allowed.""" + records = [ + {"_benchmark_name": "exp1", "max_tokens": 100}, + {"max_tokens": 200, "temperature": 0.9}, + ] + sweep = ParameterSweep.from_records(records) + assert len(sweep) == 2 + + +class 
TestParameterSweepItemKeyNormalization: + """Test key normalization in ParameterSweepItem.""" + + def test_underscore_to_hyphen_conversion(self): + """Test that underscores are converted to hyphens in CLI.""" + item = ParameterSweepItem.from_record({"max_tokens": 100}) + cmd = item.apply_to_cmd(["vllm", "serve"]) + assert "--max-tokens" in cmd + + def test_nested_key_preserves_suffix(self): + """Test that nested keys preserve the suffix format.""" + # The suffix after the dot should preserve underscores + item = ParameterSweepItem.from_record( + {"compilation_config.some_nested_param": "value"} + ) + cmd = item.apply_to_cmd(["vllm", "serve"]) + # The prefix (compilation_config) gets converted to hyphens, + # but the suffix (some_nested_param) is preserved + assert any("compilation-config.some_nested_param" in arg for arg in cmd) diff --git a/tests/benchmarks/test_plot_filters.py b/tests/benchmarks/test_plot_filters.py new file mode 100644 index 0000000000000..2b58a99125e6c --- /dev/null +++ b/tests/benchmarks/test_plot_filters.py @@ -0,0 +1,171 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pandas as pd +import pytest + +from vllm.benchmarks.sweep.plot import ( + PlotEqualTo, + PlotFilterBase, + PlotFilters, + PlotGreaterThan, + PlotGreaterThanOrEqualTo, + PlotLessThan, + PlotLessThanOrEqualTo, + PlotNotEqualTo, +) + + +class TestPlotFilters: + """Test PlotFilter functionality including 'inf' edge case.""" + + def setup_method(self): + """Create sample DataFrames for testing.""" + # DataFrame with numeric values + self.df_numeric = pd.DataFrame( + { + "request_rate": [1.0, 5.0, 10.0, 50.0, 100.0], + "value": [10, 20, 30, 40, 50], + } + ) + + # DataFrame with float('inf') - note: string "inf" values are coerced + # to float when loading data, so we only test with float('inf') + self.df_inf_float = pd.DataFrame( + { + "request_rate": [1.0, 5.0, 10.0, float("inf"), float("inf")], + "value": [10, 
20, 30, 40, 50], + } + ) + + @pytest.mark.parametrize( + "target,expected_count", + [ + ("5.0", 1), + ("10.0", 1), + ("1.0", 1), + ], + ) + def test_equal_to_numeric(self, target, expected_count): + """Test PlotEqualTo with numeric values.""" + filter_obj = PlotEqualTo("request_rate", target) + result = filter_obj.apply(self.df_numeric) + assert len(result) == expected_count + + def test_equal_to_inf_float(self): + """Test PlotEqualTo with float('inf').""" + filter_obj = PlotEqualTo("request_rate", "inf") + result = filter_obj.apply(self.df_inf_float) + # Should match both float('inf') entries because float('inf') == float('inf') + assert len(result) == 2 + + @pytest.mark.parametrize( + "target,expected_count", + [ + ("5.0", 4), # All except 5.0 + ("1.0", 4), # All except 1.0 + ], + ) + def test_not_equal_to_numeric(self, target, expected_count): + """Test PlotNotEqualTo with numeric values.""" + filter_obj = PlotNotEqualTo("request_rate", target) + result = filter_obj.apply(self.df_numeric) + assert len(result) == expected_count + + def test_not_equal_to_inf_float(self): + """Test PlotNotEqualTo with float('inf').""" + filter_obj = PlotNotEqualTo("request_rate", "inf") + result = filter_obj.apply(self.df_inf_float) + # Should exclude float('inf') entries + assert len(result) == 3 + + @pytest.mark.parametrize( + "target,expected_count", + [ + ("10.0", 2), # 1.0, 5.0 + ("50.0", 3), # 1.0, 5.0, 10.0 + ("5.0", 1), # 1.0 + ], + ) + def test_less_than(self, target, expected_count): + """Test PlotLessThan with numeric values.""" + filter_obj = PlotLessThan("request_rate", target) + result = filter_obj.apply(self.df_numeric) + assert len(result) == expected_count + + @pytest.mark.parametrize( + "target,expected_count", + [ + ("10.0", 3), # 1.0, 5.0, 10.0 + ("5.0", 2), # 1.0, 5.0 + ], + ) + def test_less_than_or_equal_to(self, target, expected_count): + """Test PlotLessThanOrEqualTo with numeric values.""" + filter_obj = PlotLessThanOrEqualTo("request_rate", target) + 
result = filter_obj.apply(self.df_numeric) + assert len(result) == expected_count + + @pytest.mark.parametrize( + "target,expected_count", + [ + ("10.0", 2), # 50.0, 100.0 + ("5.0", 3), # 10.0, 50.0, 100.0 + ], + ) + def test_greater_than(self, target, expected_count): + """Test PlotGreaterThan with numeric values.""" + filter_obj = PlotGreaterThan("request_rate", target) + result = filter_obj.apply(self.df_numeric) + assert len(result) == expected_count + + @pytest.mark.parametrize( + "target,expected_count", + [ + ("10.0", 3), # 10.0, 50.0, 100.0 + ("5.0", 4), # 5.0, 10.0, 50.0, 100.0 + ], + ) + def test_greater_than_or_equal_to(self, target, expected_count): + """Test PlotGreaterThanOrEqualTo with numeric values.""" + filter_obj = PlotGreaterThanOrEqualTo("request_rate", target) + result = filter_obj.apply(self.df_numeric) + assert len(result) == expected_count + + @pytest.mark.parametrize( + "filter_str,expected_var,expected_target,expected_type", + [ + ("request_rate==5.0", "request_rate", "5.0", PlotEqualTo), + ("request_rate!=10.0", "request_rate", "10.0", PlotNotEqualTo), + ("request_rate<50.0", "request_rate", "50.0", PlotLessThan), + ("request_rate<=50.0", "request_rate", "50.0", PlotLessThanOrEqualTo), + ("request_rate>10.0", "request_rate", "10.0", PlotGreaterThan), + ("request_rate>=10.0", "request_rate", "10.0", PlotGreaterThanOrEqualTo), + ("request_rate==inf", "request_rate", "inf", PlotEqualTo), + ("request_rate!='inf'", "request_rate", "inf", PlotNotEqualTo), + ], + ) + def test_parse_str(self, filter_str, expected_var, expected_target, expected_type): + """Test parsing filter strings.""" + filter_obj = PlotFilterBase.parse_str(filter_str) + assert isinstance(filter_obj, expected_type) + assert filter_obj.var == expected_var + assert filter_obj.target == expected_target + + def test_parse_str_inf_edge_case(self): + """Test parsing 'inf' string in filter.""" + filter_obj = PlotFilterBase.parse_str("request_rate==inf") + assert 
isinstance(filter_obj, PlotEqualTo) + assert filter_obj.var == "request_rate" + assert filter_obj.target == "inf" + + def test_parse_multiple_filters(self): + """Test parsing multiple filters.""" + filters = PlotFilters.parse_str("request_rate>5.0,value<=40") + assert len(filters) == 2 + assert isinstance(filters[0], PlotGreaterThan) + assert isinstance(filters[1], PlotLessThanOrEqualTo) + + def test_parse_empty_filter(self): + """Test parsing empty filter string.""" + filters = PlotFilters.parse_str("") + assert len(filters) == 0 diff --git a/vllm/benchmarks/sweep/param_sweep.py b/vllm/benchmarks/sweep/param_sweep.py index 986561ed8502a..a438a328880fd 100644 --- a/vllm/benchmarks/sweep/param_sweep.py +++ b/vllm/benchmarks/sweep/param_sweep.py @@ -9,8 +9,26 @@ class ParameterSweep(list["ParameterSweepItem"]): @classmethod def read_json(cls, filepath: os.PathLike): with open(filepath, "rb") as f: - records = json.load(f) + data = json.load(f) + # Support both list and dict formats + if isinstance(data, dict): + return cls.read_from_dict(data) + + return cls.from_records(data) + + @classmethod + def read_from_dict(cls, data: dict[str, dict[str, object]]): + """ + Read parameter sweep from a dict format where keys are names. + + Example: + { + "experiment1": {"max_tokens": 100, "temperature": 0.7}, + "experiment2": {"max_tokens": 200, "temperature": 0.9} + } + """ + records = [{"_benchmark_name": name, **params} for name, params in data.items()] return cls.from_records(records) @classmethod @@ -21,6 +39,15 @@ class ParameterSweep(list["ParameterSweepItem"]): f"but found type: {type(records)}" ) + # Validate that all _benchmark_name values are unique if provided + names = [r["_benchmark_name"] for r in records if "_benchmark_name" in r] + if names and len(names) != len(set(names)): + duplicates = [name for name in names if names.count(name) > 1] + raise ValueError( + f"Duplicate _benchmark_name values found: {set(duplicates)}. 
" + f"All _benchmark_name values must be unique." + ) + return cls(ParameterSweepItem.from_record(record) for record in records) @@ -38,6 +65,18 @@ class ParameterSweepItem(dict[str, object]): def __or__(self, other: dict[str, Any]): return type(self)(super().__or__(other)) + @property + def name(self) -> str: + """ + Get the name for this parameter sweep item. + + Returns the '_benchmark_name' field if present, otherwise returns a text + representation of all parameters. + """ + if "_benchmark_name" in self: + return self["_benchmark_name"] + return self.as_text(sep="-") + # In JSON, we prefer "_" def _iter_param_key_candidates(self, param_key: str): # Inner config arguments are not converted by the CLI @@ -63,29 +102,57 @@ class ParameterSweepItem(dict[str, object]): def has_param(self, param_key: str) -> bool: return any(k in self for k in self._iter_param_key_candidates(param_key)) + def _normalize_cmd_kv_pair(self, k: str, v: object) -> list[str]: + """ + Normalize a key-value pair into command-line arguments. + + Returns a list containing either: + - A single element for boolean flags (e.g., ['--flag'] or ['--flag=true']) + - Two elements for key-value pairs (e.g., ['--key', 'value']) + """ + if isinstance(v, bool): + # For nested params (containing "."), use =true/false syntax + if "." 
in k: + return [f"{self._normalize_cmd_key(k)}={'true' if v else 'false'}"] + else: + return [self._normalize_cmd_key(k if v else "no-" + k)] + else: + return [self._normalize_cmd_key(k), str(v)] + def apply_to_cmd(self, cmd: list[str]) -> list[str]: cmd = list(cmd) for k, v in self.items(): + # Skip the '_benchmark_name' field, not a parameter + if k == "_benchmark_name": + continue + + # Serialize dict values as JSON + if isinstance(v, dict): + v = json.dumps(v) + for k_candidate in self._iter_cmd_key_candidates(k): try: k_idx = cmd.index(k_candidate) - if isinstance(v, bool): - cmd[k_idx] = self._normalize_cmd_key(k if v else "no-" + k) + # Replace existing parameter + normalized = self._normalize_cmd_kv_pair(k, v) + if len(normalized) == 1: + # Boolean flag + cmd[k_idx] = normalized[0] else: - cmd[k_idx + 1] = str(v) + # Key-value pair + cmd[k_idx] = normalized[0] + cmd[k_idx + 1] = normalized[1] break except ValueError: continue else: - if isinstance(v, bool): - cmd.append(self._normalize_cmd_key(k if v else "no-" + k)) - else: - cmd.extend([self._normalize_cmd_key(k), str(v)]) + # Add new parameter + cmd.extend(self._normalize_cmd_kv_pair(k, v)) return cmd def as_text(self, sep: str = ", ") -> str: - return sep.join(f"{k}={v}" for k, v in self.items()) + return sep.join(f"{k}={v}" for k, v in self.items() if k != "_benchmark_name") diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py index 9947d6170d891..163d517931342 100644 --- a/vllm/benchmarks/sweep/plot.py +++ b/vllm/benchmarks/sweep/plot.py @@ -65,6 +65,18 @@ class PlotEqualTo(PlotFilterBase): return df[df[self.var] == target] +@dataclass +class PlotNotEqualTo(PlotFilterBase): + @override + def apply(self, df: "pd.DataFrame") -> "pd.DataFrame": + try: + target = float(self.target) + except ValueError: + target = self.target + + return df[df[self.var] != target] + + @dataclass class PlotLessThan(PlotFilterBase): @override @@ -96,6 +108,7 @@ class 
PlotGreaterThanOrEqualTo(PlotFilterBase): # NOTE: The ordering is important! Match longer op_keys first PLOT_FILTERS: dict[str, type[PlotFilterBase]] = { "==": PlotEqualTo, + "!=": PlotNotEqualTo, "<=": PlotLessThanOrEqualTo, ">=": PlotGreaterThanOrEqualTo, "<": PlotLessThan, @@ -167,6 +180,27 @@ def _json_load_bytes(path: Path) -> list[dict[str, object]]: return json.load(f) +def _convert_inf_nan_strings(data: list[dict[str, object]]) -> list[dict[str, object]]: + """ + Convert string values "inf", "-inf", and "nan" to their float equivalents. + + This handles the case where JSON serialization represents inf/nan as strings. + """ + converted_data = [] + for record in data: + converted_record = {} + for key, value in record.items(): + if isinstance(value, str): + if value in ["inf", "-inf", "nan"]: + converted_record[key] = float(value) + else: + converted_record[key] = value + else: + converted_record[key] = value + converted_data.append(converted_record) + return converted_data + + def _get_metric(run_data: dict[str, object], metric_key: str): try: return run_data[metric_key] @@ -178,12 +212,15 @@ def _get_group(run_data: dict[str, object], group_keys: list[str]): return tuple((k, str(_get_metric(run_data, k))) for k in group_keys) -def _get_fig_path(fig_dir: Path, group: tuple[tuple[str, str], ...]): +def _get_fig_path(fig_dir: Path, group: tuple[tuple[str, str], ...], fig_name: str): parts = list[str]() + + # Start with figure name (always provided, defaults to "FIGURE") + parts.append(fig_name) + + # Always append group data if present if group: - parts.extend(("FIGURE-", *(f"{k}={v}" for k, v in group))) - else: - parts.append("figure") + parts.extend(f"{k}={v}" for k, v in group) return fig_dir / sanitize_filename("-".join(parts) + ".png") @@ -217,6 +254,10 @@ def _plot_fig( scale_x: str | None, scale_y: str | None, dry_run: bool, + fig_name: str, + error_bars: bool, + fig_height: float, + fig_dpi: int, ): fig_group, fig_data = fig_group_data @@ -230,7 
+271,7 @@ def _plot_fig( for _, row_data in row_groups ) - fig_path = _get_fig_path(fig_dir, fig_group) + fig_path = _get_fig_path(fig_dir, fig_group, fig_name) print("[BEGIN FIGURE]") print(f"Group: {dict(fig_group)}") @@ -241,6 +282,8 @@ def _plot_fig( print("[END FIGURE]") return + # Convert string "inf", "-inf", and "nan" to their float equivalents + fig_data = _convert_inf_nan_strings(fig_data) df = pd.DataFrame.from_records(fig_data) if var_x not in df.columns: @@ -275,6 +318,10 @@ def _plot_fig( df = filter_by.apply(df) df = bin_by.apply(df) + # Sort by curve_by columns alphabetically for consistent legend ordering + if curve_by: + df = df.sort_values(by=curve_by) + df["row_group"] = ( pd.concat( [k + "=" + df[k].astype(str) for k in row_by], @@ -293,7 +340,7 @@ def _plot_fig( else "(All)" ) - g = sns.FacetGrid(df, row="row_group", col="col_group") + g = sns.FacetGrid(df, row="row_group", col="col_group", height=fig_height) if row_by and col_by: g.set_titles("{row_name}\n{col_name}") @@ -320,6 +367,7 @@ def _plot_fig( style=style, size=size, markers=True, + errorbar="sd" if error_bars else None, ) g.add_legend(title=hue) @@ -339,11 +387,12 @@ def _plot_fig( y=var_y, hue="curve_group", markers=True, + errorbar="sd" if error_bars else None, ) g.add_legend() - g.savefig(fig_path) + g.savefig(fig_path, dpi=fig_dpi) plt.close(g.figure) print("[END FIGURE]") @@ -364,6 +413,10 @@ def plot( scale_x: str | None, scale_y: str | None, dry_run: bool, + fig_name: str = "FIGURE", + error_bars: bool = True, + fig_height: float = 6.4, + fig_dpi: int = 300, ): all_data = [ run_data @@ -398,6 +451,10 @@ def plot( scale_x=scale_x, scale_y=scale_y, dry_run=dry_run, + fig_name=fig_name, + error_bars=error_bars, + fig_height=fig_height, + fig_dpi=fig_dpi, ), fig_groups, ) @@ -419,6 +476,10 @@ class SweepPlotArgs: scale_x: str | None scale_y: str | None dry_run: bool + fig_name: str = "FIGURE" + error_bars: bool = True + fig_height: float = 6.4 + fig_dpi: int = 300 parser_name: 
ClassVar[str] = "plot" parser_help: ClassVar[str] = "Plot performance curves from parameter sweep results." @@ -448,6 +509,10 @@ class SweepPlotArgs: scale_x=args.scale_x, scale_y=args.scale_y, dry_run=args.dry_run, + fig_name=args.fig_name, + error_bars=not args.no_error_bars, + fig_height=args.fig_height, + fig_dpi=args.fig_dpi, ) @classmethod @@ -541,6 +606,32 @@ class SweepPlotArgs: "Currently only accepts string values such as 'log' and 'sqrt'. " "See also: https://seaborn.pydata.org/generated/seaborn.objects.Plot.scale.html", ) + parser.add_argument( + "--fig-name", + type=str, + default="FIGURE", + help="Name prefix for the output figure file. " + "Group data is always appended when present. " + "Default: 'FIGURE'. Example: --fig-name my_performance_plot", + ) + parser.add_argument( + "--no-error-bars", + action="store_true", + help="If set, disables error bars on the plot. " + "By default, error bars are shown.", + ) + parser.add_argument( + "--fig-height", + type=float, + default=6.4, + help="Height of each subplot in inches. Default: 6.4", + ) + parser.add_argument( + "--fig-dpi", + type=int, + default=300, + help="Resolution of the output figure in dots per inch. 
Default: 300", + ) parser.add_argument( "--dry-run", action="store_true", @@ -566,6 +657,10 @@ def run_main(args: SweepPlotArgs): scale_x=args.scale_x, scale_y=args.scale_y, dry_run=args.dry_run, + fig_name=args.fig_name, + error_bars=args.error_bars, + fig_height=args.fig_height, + fig_dpi=args.fig_dpi, ) diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py index 1298e4acbd87d..6626707cf2a52 100644 --- a/vllm/benchmarks/sweep/serve.py +++ b/vllm/benchmarks/sweep/serve.py @@ -138,9 +138,9 @@ def _get_comb_base_path( ): parts = list[str]() if serve_comb: - parts.extend(("SERVE-", serve_comb.as_text(sep="-"))) + parts.extend(("SERVE-", serve_comb.name)) if bench_comb: - parts.extend(("BENCH-", bench_comb.as_text(sep="-"))) + parts.extend(("BENCH-", bench_comb.name)) return output_dir / sanitize_filename("-".join(parts)) @@ -345,8 +345,9 @@ class SweepServeArgs: "--serve-params", type=str, default=None, - help="Path to JSON file containing a list of parameter combinations " - "for the `vllm serve` command. " + help="Path to JSON file containing parameter combinations " + "for the `vllm serve` command. Can be either a list of dicts or a dict " + "where keys are benchmark names. " "If both `serve_params` and `bench_params` are given, " "this script will iterate over their Cartesian product.", ) @@ -354,8 +355,9 @@ class SweepServeArgs: "--bench-params", type=str, default=None, - help="Path to JSON file containing a list of parameter combinations " - "for the `vllm bench serve` command. " + help="Path to JSON file containing parameter combinations " + "for the `vllm bench serve` command. Can be either a list of dicts or " + "a dict where keys are benchmark names. 
" "If both `serve_params` and `bench_params` are given, " "this script will iterate over their Cartesian product.", ) From afb1e5b380ff623e478d19a246b42b2903b9331f Mon Sep 17 00:00:00 2001 From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com> Date: Tue, 2 Dec 2025 14:46:10 -0600 Subject: [PATCH 34/45] [CI][ROCm][tests/v1/e2e] Fix multiprocessing launch for the test (#29123) Signed-off-by: Divakar Verma <divakar.verma@amd.com> --- tests/v1/e2e/test_kv_sharing_fast_prefill.py | 22 +++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py index 2778b0c5e5670..f895fb72e94a1 100644 --- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py +++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py @@ -7,6 +7,7 @@ import pytest from vllm import LLM, SamplingParams from vllm.config import CompilationConfig, CompilationMode +from vllm.platforms import current_platform from ...utils import check_answers, fork_new_process_for_each_test, prep_prompts @@ -43,15 +44,26 @@ def test_prompts(): return prompts -@fork_new_process_for_each_test +use_fork_for_test = ( + fork_new_process_for_each_test if not current_platform.is_rocm() else lambda x: x +) + + +@use_fork_for_test @pytest.mark.parametrize("kv_sharing_fast_prefill", [False, True]) @pytest.mark.parametrize("enforce_eager", [True, False]) def test_kv_sharing_fast_prefill( monkeypatch: pytest.MonkeyPatch, kv_sharing_fast_prefill: bool, enforce_eager: bool, - test_prompts: list[str], ): + if not enforce_eager and current_platform.is_rocm(): + # Relevant context: https://github.com/vllm-project/vllm/pull/29244 + pytest.skip( + "ROCm: torch.compile produces incorrect output for gemma-3n's GELU " + "with tanh approximation. Use enforce_eager=True instead." 
+ ) + sampling_params = SamplingParams(temperature=0.0, max_tokens=100) compilation_config = CompilationConfig( # This allows vLLM compilation backend to handle allocating and @@ -65,7 +77,11 @@ def test_kv_sharing_fast_prefill( with monkeypatch.context() as m: # Make scheduling deterministic for reproducibility - m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") + if current_platform.is_rocm(): + # Use spawn to prevent cuda re-initialization error + m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") + else: + m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") prompts, answer, indices = prep_prompts(batch_size) From 1528e079e2b2cf8a807e4dce86ef05540e16a430 Mon Sep 17 00:00:00 2001 From: jthomson04 <jwillthomson19@gmail.com> Date: Tue, 2 Dec 2025 13:25:52 -0800 Subject: [PATCH 35/45] [Perf] Avoid pageable HtoD transfer in MinTokensLogitsProcessor (#29826) Signed-off-by: jthomson04 <jwillthomson19@gmail.com> --- vllm/v1/sample/logits_processor/builtin.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/v1/sample/logits_processor/builtin.py b/vllm/v1/sample/logits_processor/builtin.py index 4ee7dc2880c8c..82743f72b0310 100644 --- a/vllm/v1/sample/logits_processor/builtin.py +++ b/vllm/v1/sample/logits_processor/builtin.py @@ -110,7 +110,7 @@ class MinPLogitsProcessor(LogitsProcessor): # Identify valid tokens using threshold comparison invalid_token_mask = probability_values < adjusted_min_p # Apply mask using boolean indexing - logits[invalid_token_mask] = -float("inf") + logits.masked_fill_(invalid_token_mask, -float("inf")) return logits @@ -178,6 +178,10 @@ class MinTokensLogitsProcessor(LogitsProcessor): self._device_tensor([], torch.int32), ) + self.neg_inf_tensor = torch.tensor( + -float("inf"), dtype=torch.float32, device=self.device + ) + def is_argmax_invariant(self) -> bool: """By censoring stop tokens, min-tokens can change the outcome of the argmax operation in greedy sampling.""" @@ -229,7 +233,7 @@ class 
MinTokensLogitsProcessor(LogitsProcessor): def apply(self, logits: torch.Tensor) -> torch.Tensor: if self.min_toks: # Inhibit EOS token for requests which have not reached min length - logits[self.logits_slice] = -float("inf") + logits.index_put_(self.logits_slice, self.neg_inf_tensor) return logits From 3ff5b53bc2330688ea85d72ae79fe84eed63547c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 2 Dec 2025 21:29:32 +0000 Subject: [PATCH 36/45] Bump actions/setup-python from 6.0.0 to 6.1.0 (#29768) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/cleanup_pr_body.yml | 2 +- .github/workflows/pre-commit.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml index 861290ea43c87..56fbe5ca704a1 100644 --- a/.github/workflows/cleanup_pr_body.yml +++ b/.github/workflows/cleanup_pr_body.yml @@ -16,7 +16,7 @@ jobs: uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - name: Set up Python - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 + uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 with: python-version: '3.12' diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index d5e70f30ef638..a03b979ad761d 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -17,7 +17,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 + - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 with: python-version: "3.12" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" From 
6fc5841db14efedae7e6a8d1abdde3516c6c35a1 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Dec 2025 21:49:44 +0000 Subject: [PATCH 37/45] Fix some more Transformers nightly tests (#29872) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- examples/offline_inference/vision_language.py | 5 +++- .../vision_language_multi_image.py | 5 +++- tests/models/registry.py | 5 +++- vllm/model_executor/models/qwen2_vl.py | 9 ------ vllm/tokenizers/mistral.py | 28 +++++++++++++------ vllm/transformers_utils/config.py | 24 +++++++++++----- vllm/transformers_utils/configs/__init__.py | 2 ++ vllm/transformers_utils/configs/tarsier2.py | 24 ++++++++++++++++ 8 files changed, 75 insertions(+), 27 deletions(-) create mode 100644 vllm/transformers_utils/configs/tarsier2.py diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 8f72bf6f0b0d1..0888a9d60a3fa 100755 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1801,7 +1801,10 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData: engine_args = EngineArgs( model=model_name, max_model_len=4096, - hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}, + hf_overrides={ + "architectures": ["Tarsier2ForConditionalGeneration"], + "model_type": "tarsier2", + }, limit_mm_per_prompt={modality: 1}, ) diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 7ba4e64b567de..2193b1ca9cf48 100755 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -1222,7 +1222,10 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData: trust_remote_code=True, max_model_len=32768, limit_mm_per_prompt={"image": len(image_urls)}, - hf_overrides={"architectures": 
["Tarsier2ForConditionalGeneration"]}, + hf_overrides={ + "architectures": ["Tarsier2ForConditionalGeneration"], + "model_type": "tarsier2", + }, ) prompt = ( diff --git a/tests/models/registry.py b/tests/models/registry.py index 26351089fc464..6b1d24b1c99b5 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -831,7 +831,10 @@ _MULTIMODAL_EXAMPLE_MODELS = { "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"), "Tarsier2ForConditionalGeneration": _HfExamplesInfo( "omni-research/Tarsier2-Recap-7b", - hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}, + hf_overrides={ + "architectures": ["Tarsier2ForConditionalGeneration"], + "model_type": "tarsier2", + }, ), "VoxtralForConditionalGeneration": _HfExamplesInfo( "mistralai/Voxtral-Mini-3B-2507", diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 8fbd896223944..b748768498412 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1576,15 +1576,6 @@ class Tarsier2ForConditionalGeneration(Qwen2VLForConditionalGeneration): } ) - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - # Tarsier2 uses llava as model_type, which will create a Qwen2VLConfig - # as text_config, we need to reconstruct Qwen2VLConfig from LlavaConfig. 
- config = vllm_config.model_config.hf_config - qwen2vl_config = config.text_config - qwen2vl_config.architectures = config.architectures - vllm_config.model_config.hf_config = qwen2vl_config - super().__init__(vllm_config=vllm_config, prefix=prefix) - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: skip_prefixes = [] if self.visual is None: diff --git a/vllm/tokenizers/mistral.py b/vllm/tokenizers/mistral.py index 96d1e78ce9f17..37d67607c2cfe 100644 --- a/vllm/tokenizers/mistral.py +++ b/vllm/tokenizers/mistral.py @@ -14,13 +14,19 @@ if TYPE_CHECKING: ) from mistral_common.tokens.tokenizers.tekken import Tekkenizer from transformers import BatchEncoding - from transformers.tokenization_mistral_common import ( - MistralCommonTokenizer as TransformersMistralTokenizer, - ) from vllm.entrypoints.chat_utils import ChatCompletionMessageParam from vllm.entrypoints.openai.protocol import ChatCompletionRequest + try: + # Transformers v5 + from transformers.tokenization_mistral_common import MistralCommonBackend + except ImportError: + # Transformers v4 + from transformers.tokenization_mistral_common import ( + MistralCommonTokenizer as MistralCommonBackend, + ) + logger = init_logger(__name__) @@ -208,11 +214,17 @@ class MistralTokenizer(TokenizerLike): **kwargs, ) -> "MistralTokenizer": from mistral_common.protocol.instruct.validator import ValidationMode - from transformers.tokenization_mistral_common import ( - MistralCommonTokenizer as TransformersMistralTokenizer, - ) - tokenizer = TransformersMistralTokenizer.from_pretrained( + try: + # Transformers v5 + from transformers.tokenization_mistral_common import MistralCommonBackend + except ImportError: + # Transformers v4 + from transformers.tokenization_mistral_common import ( + MistralCommonTokenizer as MistralCommonBackend, + ) + + tokenizer = MistralCommonBackend.from_pretrained( path_or_repo_id, *args, mode=ValidationMode.test, @@ -223,7 +235,7 @@ class 
MistralTokenizer(TokenizerLike): return cls(tokenizer) - def __init__(self, tokenizer: "TransformersMistralTokenizer") -> None: + def __init__(self, tokenizer: "MistralCommonBackend") -> None: super().__init__() from mistral_common.protocol.instruct.validator import ValidationMode diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 0cceab90ba9a2..2911dcff2ab49 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -89,6 +89,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict( step3_text="Step3TextConfig", qwen3_next="Qwen3NextConfig", lfm2_moe="Lfm2MoeConfig", + tarsier2="Tarsier2Config", ) _CONFIG_ATTRS_MAPPING: dict[str, str] = { @@ -127,6 +128,9 @@ class HFConfigParser(ConfigParserBase): if config_dict.get("speculators_config") is not None else model_type ) + # Allow hf_overrides to override model_type before checking _CONFIG_REGISTRY + if (hf_overrides := kwargs.pop("hf_overrides", None)) is not None: + model_type = hf_overrides.get("model_type", model_type) if model_type in _CONFIG_REGISTRY: config_class = _CONFIG_REGISTRY[model_type] @@ -310,7 +314,7 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: config.rope_parameters["rope_theta"] = rope_theta # No RoPE parameters to patch - if not hasattr(config, "rope_parameters"): + if getattr(config, "rope_parameters", None) is None: return # Add original_max_position_embeddings if present @@ -351,7 +355,10 @@ def patch_rope_parameters_dict(rope_parameters: dict[str, Any]) -> None: rope_parameters["rope_type"] = "longrope" logger.warning("Replacing legacy rope_type 'su' with 'longrope'") elif rope_parameters["rope_type"] == "mrope": - assert "mrope_section" in rope_parameters + if "mrope_section" not in rope_parameters: + raise ValueError( + "Legacy rope_type 'mrope' requires 'mrope_section' in rope_parameters" + ) rope_parameters["rope_type"] = "default" logger.warning("Replacing legacy rope_type 'mrope' with 
'default'") @@ -584,6 +591,7 @@ def get_config( trust_remote_code=trust_remote_code, revision=revision, code_revision=code_revision, + hf_overrides=hf_overrides_kw, **kwargs, ) # Special architecture mapping check for GGUF models @@ -915,11 +923,13 @@ def get_hf_text_config(config: PretrainedConfig): """ text_config = config.get_text_config() - if text_config is not config: - # The code operates under the assumption that text_config should have - # `num_attention_heads` (among others). Assert here to fail early - # if transformers config doesn't align with this assumption. - assert hasattr(text_config, "num_attention_heads") + if text_config is not config and not hasattr(text_config, "num_attention_heads"): + raise ValueError( + "The text_config extracted from the model config does not have " + "`num_attention_heads` attribute. This indicates a mismatch " + "between the model config and vLLM's expectations. Please " + "ensure that the model config is compatible with vLLM." + ) return text_config diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 109f2b6986514..0e8d167886935 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -48,6 +48,7 @@ from vllm.transformers_utils.configs.step3_vl import ( Step3VisionEncoderConfig, Step3VLConfig, ) +from vllm.transformers_utils.configs.tarsier2 import Tarsier2Config from vllm.transformers_utils.configs.ultravox import UltravoxConfig __all__ = [ @@ -81,4 +82,5 @@ __all__ = [ "Step3VisionEncoderConfig", "Step3TextConfig", "Qwen3NextConfig", + "Tarsier2Config", ] diff --git a/vllm/transformers_utils/configs/tarsier2.py b/vllm/transformers_utils/configs/tarsier2.py new file mode 100644 index 0000000000000..12ebb4b7f602d --- /dev/null +++ b/vllm/transformers_utils/configs/tarsier2.py @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from 
transformers import Qwen2VLConfig + + +class Tarsier2Config(Qwen2VLConfig): + """ + Tarsier2's config.json is written such that AutoConfig.from_pretrained will create + a deeply nested config consisting of: + + - LlavaConfig + - Qwen2VLConfig + - Qwen2VLTextConfig + - Qwen2VLVisionConfig + - Qwen2VLConfig + - Qwen2VLTextConfig + - Qwen2VLVisionConfig + + When it should really just be a single Qwen2VLConfig. + + This class is a hack to stop AutoConfig from creating the nested config structure. + """ + + model_type = "tarsier2" From e6f114ac25967b073954f7f3dc733672d173124c Mon Sep 17 00:00:00 2001 From: Sage Moore <sage@neuralmagic.com> Date: Tue, 2 Dec 2025 14:20:22 -0800 Subject: [PATCH 38/45] [Bugfix][EPLB] Prevent user-provided EPLB config from being overwritten with defaults (#29911) Signed-off-by: Sage Moore <sage@neuralmagic.com> --- tests/distributed/test_eplb_spec_decode.py | 16 +++++++++------- vllm/engine/arg_utils.py | 14 -------------- 2 files changed, 9 insertions(+), 21 deletions(-) diff --git a/tests/distributed/test_eplb_spec_decode.py b/tests/distributed/test_eplb_spec_decode.py index c055b7a3f6dd7..868cc702866e2 100644 --- a/tests/distributed/test_eplb_spec_decode.py +++ b/tests/distributed/test_eplb_spec_decode.py @@ -22,7 +22,14 @@ def get_model_args( "num_speculative_tokens": 1, "max_model_len": model_max_len, } - + eplb_config = { + "num_redundant_experts": tp_size, + "window_size": 128, + "step_interval": 1024, + "log_balancedness": False, + } + if use_async: + eplb_config["use_async"] = True model_args = { "pretrained": model_name, "dtype": "auto", @@ -31,15 +38,10 @@ def get_model_args( "gpu_memory_utilization": 0.7, "speculative_config": speculative_config, "enable_expert_parallel": True, - "num_redundant_experts": tp_size, - "eplb_window_size": 128, - "eplb_step_interval": 1024, - "eplb_log_balancedness": False, + "eplb_config": eplb_config, "enable_eplb": True, "max_model_len": model_max_len, } - if use_async: - model_args["eplb_config"] 
= {"use_async": True} return model_args diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 83029e09ceaad..096217da4fe44 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -421,10 +421,6 @@ class EngineArgs: ) _api_process_count: int = ParallelConfig._api_process_count _api_process_rank: int = ParallelConfig._api_process_rank - num_redundant_experts: int = EPLBConfig.num_redundant_experts - eplb_window_size: int = EPLBConfig.window_size - eplb_step_interval: int = EPLBConfig.step_interval - eplb_log_balancedness: bool = EPLBConfig.log_balancedness max_parallel_loading_workers: int | None = ( ParallelConfig.max_parallel_loading_workers ) @@ -1582,16 +1578,6 @@ class EngineArgs: ) self.disable_nccl_for_dp_synchronization = True - # Forward the deprecated CLI args to the EPLB config. - if self.num_redundant_experts is not None: - self.eplb_config.num_redundant_experts = self.num_redundant_experts - if self.eplb_window_size is not None: - self.eplb_config.window_size = self.eplb_window_size - if self.eplb_step_interval is not None: - self.eplb_config.step_interval = self.eplb_step_interval - if self.eplb_log_balancedness is not None: - self.eplb_config.log_balancedness = self.eplb_log_balancedness - parallel_config = ParallelConfig( pipeline_parallel_size=self.pipeline_parallel_size, tensor_parallel_size=self.tensor_parallel_size, From 0a9caca9f5e130acbf39d5acd0b79fb492d6c4a3 Mon Sep 17 00:00:00 2001 From: Chauncey <chaunceyjiang@gmail.com> Date: Wed, 3 Dec 2025 06:42:28 +0800 Subject: [PATCH 39/45] [Bugfix] fix --scheduling-policy=priority & n>1 crashes engine (#29764) Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com> Signed-off-by: Nick Hill <nhill@redhat.com> Co-authored-by: Nick Hill <nhill@redhat.com> --- .../v1/core/test_priority_scheduler_random.py | 12 +++++++++- vllm/v1/core/sched/request_queue.py | 24 ++++++++----------- vllm/v1/request.py | 13 ++++++++++ 3 files changed, 34 insertions(+), 15 deletions(-) diff 
--git a/tests/v1/core/test_priority_scheduler_random.py b/tests/v1/core/test_priority_scheduler_random.py index b4805be802723..429b179b61dce 100644 --- a/tests/v1/core/test_priority_scheduler_random.py +++ b/tests/v1/core/test_priority_scheduler_random.py @@ -219,7 +219,17 @@ def test_priority_scheduling_blast( vllm_config=scheduler.vllm_config, ) scheduler.add_request(req) - + num_initial_requests = 2 + for _ in range(num_initial_requests): + req = _create_random_request( + max_tokens_range=(1, max_output_tokens), + num_tokens_range=(1, max_input_tokens), + arrival_time_range=(0, 0), + priority_range=(4, 4), + num_mm_item_range=(0, 2), + vllm_config=scheduler.vllm_config, + ) + scheduler.add_request(req) for _ in range(20000): if len(scheduler.waiting) == 0: num_new_requests = random.randint(0, 2) diff --git a/vllm/v1/core/sched/request_queue.py b/vllm/v1/core/sched/request_queue.py index 7bc1010db23a2..a00ca1912b0f3 100644 --- a/vllm/v1/core/sched/request_queue.py +++ b/vllm/v1/core/sched/request_queue.py @@ -137,31 +137,30 @@ class PriorityRequestQueue(RequestQueue): """ A priority queue that supports heap operations. - Requests with a smaller value of `priority` are processed first. + Respects the ordering defined in the Request class, where + requests with a smaller value of `priority` are processed first. If multiple requests have the same priority, the one with the earlier `arrival_time` is processed first. 
""" def __init__(self) -> None: - self._heap: list[tuple[int, float, Request]] = [] + self._heap: list[Request] = [] def add_request(self, request: Request) -> None: """Add a request to the queue according to priority policy.""" - heapq.heappush(self._heap, (request.priority, request.arrival_time, request)) + heapq.heappush(self._heap, request) def pop_request(self) -> Request: """Pop a request from the queue according to priority policy.""" if not self._heap: raise IndexError("pop from empty heap") - _, _, request = heapq.heappop(self._heap) - return request + return heapq.heappop(self._heap) def peek_request(self) -> Request: """Peek at the next request in the queue without removing it.""" if not self._heap: raise IndexError("peek from empty heap") - _, _, request = self._heap[0] - return request + return self._heap[0] def prepend_request(self, request: Request) -> None: """Add a request to the queue according to priority policy. @@ -180,15 +179,13 @@ class PriorityRequestQueue(RequestQueue): def remove_request(self, request: Request) -> None: """Remove a specific request from the queue.""" - self._heap = [(p, t, r) for p, t, r in self._heap if r != request] + self._heap.remove(request) heapq.heapify(self._heap) def remove_requests(self, requests: Iterable[Request]) -> None: """Remove multiple specific requests from the queue.""" - requests_to_remove = set(requests) - self._heap = [ - (p, t, r) for p, t, r in self._heap if r not in requests_to_remove - ] + requests_to_remove = requests if isinstance(requests, set) else set(requests) + self._heap = [r for r in self._heap if r not in requests_to_remove] heapq.heapify(self._heap) def __bool__(self) -> bool: @@ -203,8 +200,7 @@ class PriorityRequestQueue(RequestQueue): """Iterate over the queue according to priority policy.""" heap_copy = self._heap[:] while heap_copy: - _, _, request = heapq.heappop(heap_copy) - yield request + yield heapq.heappop(heap_copy) def __reversed__(self) -> Iterator[Request]: """Iterate 
over the queue in reverse priority order.""" diff --git a/vllm/v1/request.py b/vllm/v1/request.py index f2dfd2eed03cd..33762fe34e64f 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -227,6 +227,19 @@ class Request: events, self.events = self.events, [] return events + def __lt__(self, other: "Request") -> bool: + """ + Compare two requests based on priority, arrival time, and request ID. + Used in priority scheduling. + """ + if self.priority != other.priority: + return self.priority < other.priority + if self.arrival_time != other.arrival_time: + return self.arrival_time < other.arrival_time + if self.request_id != other.request_id: + return self.request_id < other.request_id + return id(self) < id(other) + class RequestStatus(enum.IntEnum): """Status of a request.""" From 5e5646e2064f925f97ff533aa688a43834e9ff96 Mon Sep 17 00:00:00 2001 From: Julien Denize <40604584+juliendenize@users.noreply.github.com> Date: Tue, 2 Dec 2025 23:51:20 +0100 Subject: [PATCH 40/45] [BUGFIX] llama_4_scaling wrongly passed to DeepseekAttention (#29908) Signed-off-by: juliendenize <julien.denize@mistral.ai> --- vllm/model_executor/models/deepseek_v2.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index d8a081af125c5..a8eb4a69b6f2b 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -1135,6 +1135,8 @@ class DeepseekV2DecoderLayer(nn.Module): dim == 0 for dim in (qk_nope_head_dim, qk_rope_head_dim) ) + self.use_mha = use_mha + if use_mha: attn_cls = DeepseekAttention elif model_config.use_mla: @@ -1196,11 +1198,14 @@ class DeepseekV2DecoderLayer(nn.Module): hidden_states = self.input_layernorm(hidden_states) else: hidden_states, residual = self.input_layernorm(hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - 
llama_4_scaling=llama_4_scaling, - ) + + attn_kwargs = { + "positions": positions, + "hidden_states": hidden_states, + } + if not self.use_mha: + attn_kwargs["llama_4_scaling"] = llama_4_scaling + hidden_states = self.self_attn(**attn_kwargs) if ( not isinstance(self.self_attn, DeepseekAttention) From 1b1e35aaf9d9561e1b5bf5b8e08b03565188e537 Mon Sep 17 00:00:00 2001 From: Julien Denize <40604584+juliendenize@users.noreply.github.com> Date: Tue, 2 Dec 2025 23:51:58 +0100 Subject: [PATCH 41/45] [BUGFIX] Fix regex pattern for Mistral Tool Call (#29918) Signed-off-by: juliendenize <julien.denize@mistral.ai> --- .../language/generation/test_mistral.py | 35 +++++++++++++++++++ .../tool_parsers/mistral_tool_parser.py | 2 +- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/tests/models/language/generation/test_mistral.py b/tests/models/language/generation/test_mistral.py index 1377776a6d84b..e2d6271e2faed 100644 --- a/tests/models/language/generation/test_mistral.py +++ b/tests/models/language/generation/test_mistral.py @@ -315,3 +315,38 @@ def test_mistral_function_call_nested_json(): assert json.loads(parsed.tool_calls[0].function.arguments) == args_dict # No additional content outside the tool call should be returned. assert parsed.content is None + + # multiple calls + multiple_args_dict = [ + { + "city": "Dallas", + "state": "TX", + "unit": "fahrenheit", + "sub_dict": {"foo": "bar", "inner": {"x": 1, "y": 2}}, + }, + {}, + {"a": 0}, + {"a": 1, "b": "c"}, + ] + names = ["get_current_weather", "get_current_weather_2", "random", "random_2"] + + model_output = "".join( + [ + f"{parser.bot_token}{name}{json.dumps(args)}" + for name, args in zip(names, multiple_args_dict) + ] + ) + + parsed = parser.extract_tool_calls(model_output, None) + + # Assertions: the tool call is detected and the full nested JSON is parsed + # without truncation. 
+ assert parsed.tools_called + assert len(parsed.tool_calls) == len(multiple_args_dict) + + for i, tool_call in enumerate(parsed.tool_calls): + assert MistralToolCall.is_valid_id(tool_call.id) + assert tool_call.function.name == names[i] + assert json.loads(tool_call.function.arguments) == multiple_args_dict[i] + # No additional content outside the tool call should be returned. + assert parsed.content is None diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index 89b882d6c8475..b89db60545abd 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -80,7 +80,7 @@ class MistralToolParser(ToolParser): self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL) if _is_fn_name_regex_support(self.model_tokenizer): self.fn_name_regex = re.compile( - r"([a-zA-Z0-9_-]+)(\{[\s\S]*?\})(?=\s*$|,|\s)?", re.DOTALL + r"([a-zA-Z0-9_-]+)(\{[\s\S]*?\}+)", re.DOTALL ) else: self.fn_name_regex = None From c014de1ec777554d2954655bd564493476d92061 Mon Sep 17 00:00:00 2001 From: Micah Williamson <micah.williamson@amd.com> Date: Tue, 2 Dec 2025 16:54:36 -0600 Subject: [PATCH 42/45] [ROCm][CI] Fix test_cudagraph_mode.py Failure For AMD CI (#29808) Signed-off-by: Micah Williamson <micah.williamson@amd.com> --- tests/v1/cudagraph/test_cudagraph_mode.py | 40 ++++++++--------------- 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py b/tests/v1/cudagraph/test_cudagraph_mode.py index 12621d493e549..b1895e83b8b37 100644 --- a/tests/v1/cudagraph/test_cudagraph_mode.py +++ b/tests/v1/cudagraph/test_cudagraph_mode.py @@ -100,32 +100,20 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte # test cudagraph_mode with different compilation mode. 
# (backend_name, cudagraph_mode, compilation_mode, supported) -if current_platform.is_rocm(): - combo_cases_2 = [ - ("RocmAttn", "FULL", CompilationMode.NONE, True), - ("RocmAttn", "FULL", CompilationMode.VLLM_COMPILE, True), - ("RocmAttn", "PIECEWISE", CompilationMode.NONE, False), - ("RocmAttn", "PIECEWISE", CompilationMode.VLLM_COMPILE, True), - ("RocmAttn", "FULL_AND_PIECEWISE", CompilationMode.NONE, False), - ("RocmAttn", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True), - ("RocmAttn", "FULL_DECODE_ONLY", CompilationMode.NONE, True), - ("RocmAttn", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True), - ("RocmAttn", "NONE", CompilationMode.NONE, True), - ("RocmAttn", "NONE", CompilationMode.VLLM_COMPILE, True), - ] -else: - combo_cases_2 = [ - ("FA2", "FULL", CompilationMode.NONE, True), - ("FA2", "FULL", CompilationMode.VLLM_COMPILE, True), - ("FA2", "PIECEWISE", CompilationMode.NONE, True), - ("FA2", "PIECEWISE", CompilationMode.VLLM_COMPILE, True), - ("FA2", "FULL_AND_PIECEWISE", CompilationMode.NONE, True), - ("FA2", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True), - ("FA2", "FULL_DECODE_ONLY", CompilationMode.NONE, True), - ("FA2", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True), - ("FA2", "NONE", CompilationMode.NONE, True), - ("FA2", "NONE", CompilationMode.VLLM_COMPILE, True), - ] +attn_backend = "RocmAttn" if current_platform.is_rocm() else "FA2" + +combo_cases_2 = [ + (attn_backend, "FULL", CompilationMode.NONE, True), + (attn_backend, "FULL", CompilationMode.VLLM_COMPILE, True), + (attn_backend, "PIECEWISE", CompilationMode.NONE, True), + (attn_backend, "PIECEWISE", CompilationMode.VLLM_COMPILE, True), + (attn_backend, "FULL_AND_PIECEWISE", CompilationMode.NONE, True), + (attn_backend, "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True), + (attn_backend, "FULL_DECODE_ONLY", CompilationMode.NONE, True), + (attn_backend, "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True), + (attn_backend, "NONE", 
CompilationMode.NONE, True), + (attn_backend, "NONE", CompilationMode.VLLM_COMPILE, True), +] @pytest.mark.parametrize( From 5d91d2b292be9b1d6b121d36d242d5077a031e4b Mon Sep 17 00:00:00 2001 From: maang-h <55082429+maang-h@users.noreply.github.com> Date: Wed, 3 Dec 2025 07:23:09 +0800 Subject: [PATCH 43/45] [Doc] Add allocate_slots parameter docs (#29777) Signed-off-by: maang <maang_h@163.com> Signed-off-by: maang-h <55082429+maang-h@users.noreply.github.com> Co-authored-by: Chen Zhang <zhangch99@outlook.com> --- vllm/v1/core/kv_cache_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 3823384881cd3..33e8c81514c5f 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -230,6 +230,9 @@ class KVCacheManager: delay_cache_blocks: Whether to skip caching the blocks. This is used by P/D when allocating blocks used in a KV transfer which will complete in a future step. + num_encoder_tokens: The number of encoder tokens to allocate for + cross-attention in encoder-decoder models(e.g., Whisper). + For decoder-only models, this should be 0. Blocks layout: ``` From 5f67361fd12851bfe8faad4cc173ca24565611e4 Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Tue, 2 Dec 2025 18:40:02 -0600 Subject: [PATCH 44/45] Reverting re-direction to amd_mi355_X. (#29914) Signed-off-by: Alexei V. 
Ivanov <alexei.ivanov@amd.com> --- .buildkite/test-amd.yaml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 67088caa8150b..ee4fdebae5675 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -51,7 +51,7 @@ steps: - label: Async Engine, Inputs, Utils, Worker Test # 10min timeout_in_minutes: 15 mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi355_1 + agent_pool: mi325_1 grade: Blocking source_file_dependencies: - vllm/ @@ -64,7 +64,7 @@ steps: - label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min timeout_in_minutes: 20 mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi355_1 + agent_pool: mi325_1 grade: Blocking source_file_dependencies: - vllm/ @@ -99,7 +99,7 @@ steps: - label: Basic Correctness Test # 20min timeout_in_minutes: 30 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 + agent_pool: mi325_1 # grade: Blocking fast_check: true torch_nightly: true @@ -116,7 +116,7 @@ steps: - label: Entrypoints Unit Tests # 5min mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi355_1 + agent_pool: mi325_1 grade: Blocking timeout_in_minutes: 10 working_dir: "/vllm-workspace/tests" @@ -131,7 +131,7 @@ steps: - label: Entrypoints Integration Test (LLM) # 30min timeout_in_minutes: 40 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 + agent_pool: mi325_1 # grade: Blocking working_dir: "/vllm-workspace/tests" fast_check: true @@ -254,7 +254,7 @@ steps: - label: EPLB Algorithm Test # 5min mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi355_1 + agent_pool: mi325_1 grade: Blocking timeout_in_minutes: 15 working_dir: "/vllm-workspace/tests" @@ -266,7 +266,7 @@ steps: - label: EPLB Execution Test # 10min mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_4 + 
agent_pool: mi325_4 # grade: Blocking timeout_in_minutes: 20 working_dir: "/vllm-workspace/tests" @@ -281,7 +281,7 @@ steps: - label: Metrics, Tracing Test # 12min timeout_in_minutes: 20 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_2 + agent_pool: mi325_2 # grade: Blocking num_gpus: 2 source_file_dependencies: @@ -301,7 +301,7 @@ steps: - label: Regression Test # 7min timeout_in_minutes: 20 mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi355_1 + agent_pool: mi325_1 grade: Blocking source_file_dependencies: - vllm/ @@ -343,7 +343,7 @@ steps: - label: V1 Test entrypoints # 35min timeout_in_minutes: 50 mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi355_1 + agent_pool: mi325_1 grade: Blocking source_file_dependencies: - vllm/ @@ -544,7 +544,7 @@ steps: - label: PyTorch Fullgraph Test # 27min timeout_in_minutes: 40 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 + agent_pool: mi325_1 # grade: Blocking torch_nightly: true source_file_dependencies: From 5cdd66450910589c8e1a3d25e80711b0b6e51eb1 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com> Date: Tue, 2 Dec 2025 19:56:54 -0500 Subject: [PATCH 45/45] [BugFix] Fix assert in `build_for_cudagraph_capture` (#29893) Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> --- vllm/v1/worker/gpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ee28f477a26ad..8c22ada029b1a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -4000,7 +4000,7 @@ class GPUModelRunner( num_reqs=num_reqs_padded, max_query_len=max_query_len, ubatch_slices=ubatch_slices, - for_cudagraph_capture=True, + for_cudagraph_capture=is_graph_capturing, ) with self.maybe_dummy_run_with_lora(