diff --git a/vllm/compilation/cuda_piecewise_backend.py b/vllm/compilation/cuda_piecewise_backend.py
index 8bf957368f6ab..16a7098749f8e 100644
--- a/vllm/compilation/cuda_piecewise_backend.py
+++ b/vllm/compilation/cuda_piecewise_backend.py
@@ -105,6 +105,7 @@ class CUDAPiecewiseBackend:
             end_monitoring_torch_compile(self.vllm_config)
 
     def __call__(self, *args) -> Any:
+        # logger.info("CUDA BACKEND CALL")
         if not self.first_run_finished:
             self.first_run_finished = True
             self.check_for_ending_compilation()
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 05e4ca9f08b36..0474db0820c73 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -170,6 +170,7 @@ def _support_torch_compile(
         # e.g. TPU has the compilation logic in model runner, so we don't
         # need to compile the model inside.
         if self.do_not_compile or torch.compiler.is_compiling():
+            logger.debug("SKIPPING COMPILATION")
             return self.forward(*args, **kwargs)
 
         # the first compilation needs to have dynamic shapes marked
diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
index d823706b63e03..cd5ec2f4cbf2d 100644
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@@ -134,14 +134,14 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                 do_recv=not send,
             )
 
-        # yield_and_switch_from_compute_to_comm_impl(schedule="default")
+        yield_and_switch_from_compute_to_comm_impl(schedule="default")
         dispatch(True)  # Send
         # torch.cuda.synchronize()
         # print(f"{ubatch_id} AFTER SEND SYNC", flush=True)
         dispatch(False)  # Recv
         # torch.cuda.synchronize()
         # print(f"{ubatch_id} AFTER RECV SYNC", flush=True)
-        # yield_and_switch_from_comm_to_compute_impl(schedule="default")
+        yield_and_switch_from_comm_to_compute_impl(schedule="default")
         # torch.cuda.synchronize()
         if expert_x_scale is not None:
             expert_x_scale = expert_x_scale[:, :, 0:1]
@@ -185,11 +185,11 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                 do_recv=not send,
             )
 
-        # yield_and_switch_from_compute_to_comm_impl(schedule="default")
+        yield_and_switch_from_compute_to_comm_impl(schedule="default")
         combine(True)
         # torch.cuda.synchronize()
         # print(f"{ubatch_id} AFTER COMBINE SEND SYNC", flush=True)
         combine(False)
         # print(f"{ubatch_id} AFTER COMBINE RECV SYNC", flush=True)
-        # yield_and_switch_from_comm_to_compute_impl(schedule="default")
+        yield_and_switch_from_comm_to_compute_impl(schedule="default")
         # torch.cuda.synchronize()
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index c3a9f96f57f61..ebec98f7219c4 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -206,6 +206,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         self.use_cuda_graph = (self.vllm_config.compilation_config.level
                                == CompilationLevel.PIECEWISE
                                and not self.model_config.enforce_eager)
+        logger.info("use_cuda_graph=%s", self.use_cuda_graph)
         # TODO(woosuk): Provide an option to tune the max cudagraph batch size.
         # The convention is different.
         # self.cudagraph_batch_sizes sorts in ascending order.
@@ -702,7 +703,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                 if num_pad_tokens < scheduler_output.total_num_scheduled_tokens:
                     self.pad_out_ubatch_first_stage(ubatch_slices, num_pad_tokens)
                 else:
-                    assert False
                     # We bail out of ubatching here. This accounts for the case where 
                     # the padding would result in an "empty" second ubatch.
                     # TODO: just make the second ubatch a dummy ubatch
@@ -711,13 +711,13 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         
         # Note that if we are attempting to ubatch by this point then we know that no 
         # DP ranks are doing dummy runs
-        # if ubatch_slices:
-        #     should_ubatch = self.should_ubatch(False if ubatch_bailout else True)
-        #     if not should_ubatch:
-        #         logger.info("SUCCESSFULLY BAILED OUT")
-        #         num_pad_tokens = 0
-        #         num_tokens_after_padding = None
-        #         ubatch_slices = None
+        if ubatch_slices:
+            should_ubatch = self.should_ubatch(not ubatch_bailout)
+            if not should_ubatch:  # reset the ubatching state below
+                logger.info("SUCCESSFULLY BAILED OUT")
+                num_pad_tokens = 0
+                num_tokens_after_padding = None
+                ubatch_slices = None
 
         
         # This AR is only necessary in the case described above where 
@@ -2150,6 +2150,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         # For profiling runs we dont want microbatching but for
         # dp dummy runs we do.
         allow_microbatching: bool = False,
+    # TODO: decide whether this method should return a CUDA graph.
     ) -> torch.Tensor:
 
         should_microbatch = False
@@ -2402,19 +2403,24 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         start_time = time.perf_counter()
         start_free_gpu_memory = torch.cuda.mem_get_info()[0]
 
+        logger.info("CAPTURE MODEL START")
         # Trigger CUDA graph capture for specific shapes.
         # Capture the large shapes first so that the smaller shapes
         # can reuse the memory pool allocated for the large shapes.
         with graph_capture(device=self.device):
             skip_attn = not self.vllm_config.compilation_config.full_cuda_graph
+            allow_microbatching = self.parallel_config.enable_microbatching
             for num_tokens in reversed(self.cudagraph_batch_sizes):
                 for _ in range(self.vllm_config.compilation_config.
                                cudagraph_num_of_warmups):
-                    self._dummy_run(num_tokens, skip_attn=skip_attn)
+                    self._dummy_run(num_tokens, skip_attn=skip_attn,
+                                    allow_microbatching=allow_microbatching)
                 # print("CUDAGRAPH CAPTURE START")
-                self._dummy_run(num_tokens, skip_attn=skip_attn)
+                self._dummy_run(num_tokens, skip_attn=skip_attn,
+                                allow_microbatching=allow_microbatching)
                 # print("CUDAGRAPH CAPTURE END")
 
+        logger.info("CAPTURE MODEL END")
         end_time = time.perf_counter()
         end_free_gpu_memory = torch.cuda.mem_get_info()[0]
         elapsed_time = end_time - start_time