From 21e39436c8062ebbf4a160eebf56d7d303896e68 Mon Sep 17 00:00:00 2001
From: Calvin Chen
Date: Mon, 18 Aug 2025 05:45:42 +0800
Subject: [PATCH 1/3] [XPU] fix xpu to set cudagraph batch sizes (#23044)

Signed-off-by: calvin chen
---
 vllm/v1/worker/gpu_model_runner.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 4219d9147ada2..adaa1306f6ca4 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -232,8 +232,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # The convention is different.
         # self.cudagraph_batch_sizes sorts in ascending order.
         # The batch sizes in the config are in descending order.
-        self.cudagraph_batch_sizes = list(
-            reversed(self.compilation_config.cudagraph_capture_sizes))
+        if self.compilation_config.cudagraph_capture_sizes and \
+            self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE:
+            self.cudagraph_batch_sizes = list(
+                reversed(self.compilation_config.cudagraph_capture_sizes))
 
         # Cache the device properties.
         self._init_device_properties()

From 0fc8fa751a4321d6531467537ff77cf3c1c70260 Mon Sep 17 00:00:00 2001
From: Simon Mo
Date: Sun, 17 Aug 2025 15:56:07 -0700
Subject: [PATCH 2/3] fix: gptq marlin weight loading failure (#23066)

---
 vllm/model_executor/layers/quantization/gptq_marlin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index bd14ab9ef6c69..c5d1e017014f3 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -56,7 +56,7 @@ def get_moe_quant_method(
 
         # Dynamic per module/layer rules may override base config
         override_config(cloned_config, prefix=prefix)
 
-        return moe_method_cls(cloned_config)
+        return moe_method_cls(cloned_config, layer.moe_config)
     return None
 
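A note on the two fixes above. Patch 1 matters because XPU builds do not
capture CUDA graphs, so compilation_config.cudagraph_capture_sizes can be
unset, and the old unconditional list(reversed(...)) failed during
model-runner construction. Below is a minimal, self-contained sketch of the
guarded initialization; CUDAGraphMode and CompilationConfig here are
simplified stand-ins for illustration, not the real vLLM classes:

    from dataclasses import dataclass
    from enum import Enum
    from typing import Optional

    class CUDAGraphMode(Enum):
        NONE = 0
        FULL = 1

    @dataclass
    class CompilationConfig:
        # None or empty on platforms (e.g. XPU) that never capture graphs.
        cudagraph_capture_sizes: Optional[list[int]] = None
        cudagraph_mode: CUDAGraphMode = CUDAGraphMode.NONE

    def init_cudagraph_batch_sizes(cfg: CompilationConfig) -> list[int]:
        # The config lists sizes in descending order; the runner keeps
        # them ascending, hence reversed(). Without the guard,
        # reversed(None) raises TypeError on graph-less platforms.
        if (cfg.cudagraph_capture_sizes
                and cfg.cudagraph_mode != CUDAGraphMode.NONE):
            return list(reversed(cfg.cudagraph_capture_sizes))
        return []

    print(init_cudagraph_batch_sizes(CompilationConfig()))  # -> []
    print(init_cudagraph_batch_sizes(
        CompilationConfig([8, 4, 2], CUDAGraphMode.FULL)))  # -> [2, 4, 8]

Patch 2 passes layer.moe_config through to the MoE quant-method constructor;
per its subject, the single-argument call was breaking GPTQ-Marlin weight
loading, presumably because the method class now expects the layer's MoE
config at construction time.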
From 8ea0c2753a273e24957ab4587c200a3254ebe970 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 17 Aug 2025 18:16:03 -0700
Subject: [PATCH 3/3] [Misc] Minor code cleanup for _get_prompt_logprobs_dict
 (#23064)

Signed-off-by: Woosuk Kwon
---
 vllm/v1/worker/gpu_model_runner.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index adaa1306f6ca4..fc320be1c3bda 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1722,7 +1722,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # Compute prompt logprobs if needed.
         prompt_logprobs_dict = self._get_prompt_logprobs_dict(
             hidden_states[:num_scheduled_tokens],
-            scheduler_output,
+            scheduler_output.num_scheduled_tokens,
         )
 
         # Get the valid generated tokens.
@@ -2064,7 +2064,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
     def _get_prompt_logprobs_dict(
         self,
         hidden_states: torch.Tensor,
-        scheduler_output: "SchedulerOutput",
+        num_scheduled_tokens: dict[str, int],
     ) -> dict[str, Optional[LogprobsTensors]]:
         num_prompt_logprobs_dict = self.input_batch.num_prompt_logprobs
         if not num_prompt_logprobs_dict:
@@ -2077,8 +2077,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # maintainable loop over optimal performance.
         completed_prefill_reqs = []
         for req_id, num_prompt_logprobs in num_prompt_logprobs_dict.items():
-
-            num_tokens = scheduler_output.num_scheduled_tokens[req_id]
+            num_tokens = num_scheduled_tokens[req_id]
 
             # Get metadata for this request.
             request = self.requests[req_id]
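The last patch is a small interface-narrowing cleanup:
_get_prompt_logprobs_dict only ever read num_scheduled_tokens from the
SchedulerOutput it was handed, so it now takes just that dict[str, int] and
the caller unpacks the field at the call site. A minimal sketch of the
pattern, using illustrative stand-in names rather than the real vLLM classes:

    from dataclasses import dataclass, field

    @dataclass
    class SchedulerOutput:
        # The real scheduler output carries much more state; the helper
        # below only needs this one mapping.
        num_scheduled_tokens: dict[str, int] = field(default_factory=dict)

    def get_prompt_logprobs_dict(
            num_scheduled_tokens: dict[str, int]) -> dict[str, int]:
        # Placeholder body: taking only the mapping keeps the dependency
        # explicit and lets tests call this without a full SchedulerOutput.
        return dict(num_scheduled_tokens)

    out = SchedulerOutput({"req-0": 17, "req-1": 3})
    # The caller unpacks the field, mirroring the patched call site.
    print(get_prompt_logprobs_dict(out.num_scheduled_tokens))

Narrowing the parameter also makes the dependency visible in the signature,
so the helper cannot silently grow reads of other scheduler state.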