diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 736fd12357034..a3d4ec4e64a3d 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1230,6 +1230,48 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                                                 device="cpu",
                                                 dtype=torch.int32)
         return max_tokens_across_dp_cpu - num_tokens, num_tokens_after_padding
+
+    def pad_ubatch(self, target_num_tokens, ubatch_slice: UbatchSlice):
+        pass
+
+    def get_dp_padding_ubatch(self,
+            ubatch_slices: UBatchSlices) -> tuple[int, Optional[torch.Tensor]]:
+        dp_size = self.vllm_config.parallel_config.data_parallel_size
+        dp_rank = self.vllm_config.parallel_config.data_parallel_rank
+
+        first_ubatch_slice = ubatch_slices[0]
+        second_ubatch_slice = ubatch_slices[1]
+
+        first_ubatch_num_tokens = first_ubatch_slice[1].stop - first_ubatch_slice[1].start
+        second_ubatch_num_tokens = second_ubatch_slice[1].stop - second_ubatch_slice[1].start
+
+        num_tokens = max(first_ubatch_num_tokens, second_ubatch_num_tokens)
+
+        # For DP: Don't pad when setting enforce_eager.
+        # This lets us set enforce_eager on the prefiller in a P/D setup and
+        # still use CUDA graphs (enabled by this padding) on the decoder.
+        #
+        # TODO(tms): There are many cases where padding is enabled for
+        # prefills, causing unnecessary and excessive padding of activations.
+
+        if dp_size == 1:
+            # Early exit.
+            return 0, None
+
+        num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
+            num_tokens, dp_size, dp_rank)
+        max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp).item()
+        num_tokens_after_padding = torch.tensor([max_tokens_across_dp_cpu] *
+                                                dp_size,
+                                                device="cpu",
+                                                dtype=torch.int32)
+        # Note that num_pad_tokens is the number of tokens added to
+        # *each* ubatch, so 2 * num_pad_tokens tokens in total are
+        # added on each DP rank.
+        num_pad_tokens = max_tokens_across_dp_cpu - num_tokens
+        self.pad_ubatch(num_pad_tokens, first_ubatch_slice)
+        self.pad_ubatch(num_pad_tokens, second_ubatch_slice)
+        return num_pad_tokens, num_tokens_after_padding

     def should_ubatch(self, should_ubatch: bool) -> bool:
         dp_size = self.vllm_config.parallel_config.data_parallel_size
@@ -1471,6 +1513,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             self._prepare_inputs(scheduler_output))
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
+        num_pad_tokens, num_tokens_after_padding = \
+            self.get_dp_padding_ubatch(ubatch_slices)
+        num_scheduled_tokens += num_pad_tokens
         # Run the decoder.
         # Use persistent buffers for CUDA graphs.
         self.maybe_setup_kv_connector(scheduler_output)
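
For reference, below is a minimal, self-contained sketch of the padding math that get_dp_padding_ubatch performs. It is illustration only, not vLLM's API: tokens_across_dp stands in for the per-rank token counts that DPMetadata.num_tokens_across_dp() would gather over the real DP process group, and dp_padding_for_ubatches is a hypothetical helper name.

import torch


def dp_padding_for_ubatches(
        first_ubatch_tokens: int,
        second_ubatch_tokens: int,
        tokens_across_dp: list[int]) -> tuple[int, torch.Tensor]:
    # Each rank contributes the larger of its two micro-batches.
    num_tokens = max(first_ubatch_tokens, second_ubatch_tokens)
    # Every rank pads its ubatches up to the max across all DP ranks so
    # that all ranks replay CUDA graphs of the same shape.
    max_tokens_across_dp = max(tokens_across_dp)
    num_tokens_after_padding = torch.tensor(
        [max_tokens_across_dp] * len(tokens_across_dp),
        device="cpu",
        dtype=torch.int32)
    # num_pad_tokens applies to *each* of the two ubatches, i.e.
    # 2 * num_pad_tokens extra tokens per DP rank.
    num_pad_tokens = max_tokens_across_dp - num_tokens
    return num_pad_tokens, num_tokens_after_padding


# This rank's ubatches hold 96 and 128 tokens; the gathered counts
# (including this rank's 128) peak at 160 on some other rank.
pad, padded = dp_padding_for_ubatches(96, 128, [128, 160])
assert pad == 32                      # each ubatch grows by 32 tokens
assert padded.tolist() == [160, 160]  # all ranks run 160-token ubatches

Padding every ubatch to the max across ranks keeps ubatch shapes identical on all DP ranks, which is what enables the CUDA graph replay mentioned in the comments in the diff; the cost is the excess activation padding that the TODO(tms) note calls out for prefill-heavy ranks.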