From 4819bb87151035a0fa95d783138fb60252fb0b4b Mon Sep 17 00:00:00 2001
From: Sage Moore
Date: Tue, 5 Aug 2025 18:01:25 +0000
Subject: [PATCH] fix eager mode

Signed-off-by: Sage Moore
---
 vllm/v1/worker/gpu_model_runner.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 360d24f39a1ae..93b40c6241b73 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -250,12 +250,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             is_spec_decode=bool(self.vllm_config.speculative_config),
         )
 
+        can_use_cudagraphs = (self.vllm_config.compilation_config.level
+                              == CompilationLevel.PIECEWISE
+                              or self.compilation_config.full_cuda_graph)
         self.use_cuda_graph = (
-            self.vllm_config.compilation_config.level
-            == CompilationLevel.PIECEWISE
+            can_use_cudagraphs
             and self.vllm_config.compilation_config.use_cudagraph
             and not self.model_config.enforce_eager)
-        self.use_cuda_graph = True
+        # self.use_cuda_graph = True
         # TODO(woosuk): Provide an option to tune the max cudagraph batch size.
         # The convention is different.
         # self.cudagraph_batch_sizes sorts in ascending order.
@@ -654,7 +656,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         num_tokens_after_padding = None
         ubatch_abort = False
         num_pad_tokens, num_tokens_after_padding = self.get_dp_padding_ubatch(
-            ubatch_slices, True)
+            ubatch_slices)
         if num_pad_tokens > 0:
             # Check if the padding would result in an empty second ubatch.
             # If so abort ubatching
@@ -1546,9 +1548,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         num_tokens_padded = num_tokens_unpadded
 
-        # if (self.use_cuda_graph
-        #         and num_tokens_unpadded <= self.cudagraph_batch_sizes[-1]):
-        if False:
+        if (self.use_cuda_graph and not self.parallel_config.enable_microbatching
+                and num_tokens_unpadded <= self.cudagraph_batch_sizes[-1]):
+            # if False:
             # Use piecewise CUDA graphs.
             # Add padding to the batch size.
             num_tokens_padded = self.vllm_config.pad_for_cudagraph(
@@ -1571,8 +1573,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
     def get_dp_padding_ubatch(
         self,
-        ubatch_slices: UBatchSlices,
-        include_cudagraphs: bool) -> tuple[int, Optional[torch.Tensor]]:
+        ubatch_slices: UBatchSlices) -> tuple[int, Optional[torch.Tensor]]:
 
         dp_size = self.vllm_config.parallel_config.data_parallel_size
         if dp_size == 1:
@@ -1592,7 +1593,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         num_tokens_unpadded = first_ubatch_num_tokens + second_ubatch_num_tokens
         num_tokens_padded = round_up(num_tokens_unpadded, 2)
 
-        if (include_cudagraphs and self.use_cuda_graph
+        if (self.full_cuda_graph
                 and num_tokens_unpadded <= self.cudagraph_batch_sizes[-1]):
             # Add padding to the batch size.
             num_tokens_padded = self.vllm_config.pad_for_cudagraph(num_tokens_unpadded)
@@ -3056,7 +3057,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             self._dummy_run(num_tokens,
                             capture_attn_cudagraph=full_cg,
                             allow_microbatching=allow_microbatching,
-                            build_cuda_graph=True,
+                            build_cuda_graph=full_cg,
                             skip_eplb=True)
 
         end_time = time.perf_counter()
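
--
Reviewer notes (not part of the patch):

The core of the eager-mode fix is that self.use_cuda_graph was being
unconditionally overwritten with True, so enforce_eager never took effect.
Below is a minimal standalone sketch of the gating logic this patch
converges on; CompilationLevel and CompilationConfig are simplified
stand-ins for illustration, not vLLM's real definitions.

    from dataclasses import dataclass
    from enum import Enum

    class CompilationLevel(Enum):
        # Simplified stand-in; vLLM defines these as integer constants.
        NO_COMPILATION = 0
        PIECEWISE = 3

    @dataclass
    class CompilationConfig:
        level: CompilationLevel
        use_cudagraph: bool = True
        full_cuda_graph: bool = False

    def resolve_use_cuda_graph(compilation_config: CompilationConfig,
                               enforce_eager: bool) -> bool:
        # Cudagraphs are only *possible* under piecewise compilation, or
        # when a full cudagraph is explicitly requested.
        can_use_cudagraphs = (
            compilation_config.level == CompilationLevel.PIECEWISE
            or compilation_config.full_cuda_graph)
        # ...and only actually *used* when they are enabled and eager mode
        # is not forced. The hardcoded `self.use_cuda_graph = True` that
        # this patch comments out was overriding enforce_eager.
        return (can_use_cudagraphs
                and compilation_config.use_cudagraph
                and not enforce_eager)

    cfg = CompilationConfig(level=CompilationLevel.PIECEWISE)
    assert resolve_use_cuda_graph(cfg, enforce_eager=False)
    assert not resolve_use_cuda_graph(cfg, enforce_eager=True)  # the fix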
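
The get_dp_padding_ubatch change folds the removed include_cudagraphs
parameter into the condition itself: ubatch token counts are padded up to
a captured graph size only when full cudagraphs are in use. A sketch under
the assumption that cudagraph_batch_sizes is sorted ascending with even
entries; pad_ubatch_tokens and the bisect-based lookup are hypothetical
stand-ins for vllm_config.pad_for_cudagraph, not vLLM's implementation.

    from bisect import bisect_left

    def pad_ubatch_tokens(num_tokens_unpadded: int,
                          cudagraph_batch_sizes: list[int],
                          full_cuda_graph: bool) -> int:
        # Two microbatches must split evenly, so always round up to a
        # multiple of 2 first (vLLM's round_up(num_tokens_unpadded, 2)).
        num_tokens_padded = num_tokens_unpadded + (num_tokens_unpadded % 2)
        # Pad to a captured graph size only for full cudagraphs; other
        # configs no longer take this branch (the removed
        # include_cudagraphs flag used to gate it from the call site).
        if (full_cuda_graph
                and num_tokens_unpadded <= cudagraph_batch_sizes[-1]):
            # Hypothetical stand-in for vllm_config.pad_for_cudagraph:
            # pick the smallest captured size that fits the batch.
            idx = bisect_left(cudagraph_batch_sizes, num_tokens_padded)
            num_tokens_padded = cudagraph_batch_sizes[idx]
        return num_tokens_padded

    # Example: with captured sizes [2, 4, 8, 16], a 5-token batch rounds
    # up to 6, then pads to 8 under full cudagraphs; otherwise it stays 6.
    assert pad_ubatch_tokens(5, [2, 4, 8, 16], full_cuda_graph=True) == 8
    assert pad_ubatch_tokens(5, [2, 4, 8, 16], full_cuda_graph=False) == 6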