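Misc changes: re-enable ubatch yields and bailout check, allow microbatching during CUDA graph capture, add debug logging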

Signed-off-by: Sage Moore <sage@neuralmagic.com>
2026-05-26 17:21:22 +08:00 · 2025-06-17 13:34:46 +00:00 · 2025-06-17 13:34:46 +00:00 · 1d112d90a5
commit 1d112d90a5
parent b74c731342
4 changed files with 22 additions and 14 deletions
--- a/vllm/compilation/cuda_piecewise_backend.py
+++ b/vllm/compilation/cuda_piecewise_backend.py
@@ -105,6 +105,7 @@ class CUDAPiecewiseBackend:
            end_monitoring_torch_compile(self.vllm_config)

    def __call__(self, *args) -> Any:
+        # logger.info("CUDA BACKEND CALL")
        if not self.first_run_finished:
            self.first_run_finished = True
            self.check_for_ending_compilation()
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -170,6 +170,7 @@ def _support_torch_compile(
        # e.g. TPU has the compilation logic in model runner, so we don't
        # need to compile the model inside.
        if self.do_not_compile or torch.compiler.is_compiling():
+            logger.info("SKIPPING COMPILATION")
            return self.forward(*args, **kwargs)

        # the first compilation needs to have dynamic shapes marked
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@@ -134,14 +134,14 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                do_recv=not send,
            )

-        # yield_and_switch_from_compute_to_comm_impl(schedule="default")
+        yield_and_switch_from_compute_to_comm_impl(schedule="default")
        dispatch(True)  # Send
        # torch.cuda.synchronize()
        # print(f"{ubatch_id} AFTER SEND SYNC", flush=True)
        dispatch(False)  # Recv
        # torch.cuda.synchronize()
        # print(f"{ubatch_id} AFTER RECV SYNC", flush=True)
-        # yield_and_switch_from_comm_to_compute_impl(schedule="default")
+        yield_and_switch_from_comm_to_compute_impl(schedule="default")
        # torch.cuda.synchronize()
        if expert_x_scale is not None:
            expert_x_scale = expert_x_scale[:, :, 0:1]
@@ -185,11 +185,11 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                do_recv=not send,
            )

-        # yield_and_switch_from_compute_to_comm_impl(schedule="default")
+        yield_and_switch_from_compute_to_comm_impl(schedule="default")
        combine(True)
        # torch.cuda.synchronize()
        # print(f"{ubatch_id} AFTER COMBINE SEND SYNC", flush=True)
        combine(False)
        # print(f"{ubatch_id} AFTER COMBINE RECV SYNC", flush=True)
-        # yield_and_switch_from_comm_to_compute_impl(schedule="default")
+        yield_and_switch_from_comm_to_compute_impl(schedule="default")
        # torch.cuda.synchronize()
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -206,6 +206,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
        self.use_cuda_graph = (self.vllm_config.compilation_config.level
                               == CompilationLevel.PIECEWISE
                               and not self.model_config.enforce_eager)
+        logger.info(f"self.use_cuda_graph {self.use_cuda_graph}")
        # TODO(woosuk): Provide an option to tune the max cudagraph batch size.
        # The convention is different.
        # self.cudagraph_batch_sizes sorts in ascending order.
@@ -702,7 +703,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                if num_pad_tokens < scheduler_output.total_num_scheduled_tokens:
                    self.pad_out_ubatch_first_stage(ubatch_slices, num_pad_tokens)
                else:
-                    assert False
                    # We bail out of ubatching here. This accounts for the case where 
                    # the padding would result in an "empty" second ubatch.
                    # TODO: just make the second ubatch a dummy ubatch
@@ -711,13 +711,13 @@ class GPUModelRunner(LoRAModelRunnerMixin):
        
        # Note that if we are attempting to ubatch by this point then we know that no 
        # DP ranks are doing dummy runs
-        # if ubatch_slices:
-        #     should_ubatch = self.should_ubatch(False if ubatch_bailout else True)
-        #     if not should_ubatch:
-        #         logger.info("SUCCESSFULLY BAILED OUT")
-        #         num_pad_tokens = 0
-        #         num_tokens_after_padding = None
-        #         ubatch_slices = None
+        if ubatch_slices:
+            should_ubatch = self.should_ubatch(False if ubatch_bailout else True)
+            if not should_ubatch:
+                logger.info("SUCCESSFULLY BAILED OUT")
+                num_pad_tokens = 0
+                num_tokens_after_padding = None
+                ubatch_slices = None

        
        # This AR is only necessary in the case described above where 
@@ -2150,6 +2150,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
        # For profiling runs we dont want microbatching but for
        # dp dummy runs we do.
        allow_microbatching: bool = False,
+    # Maybe return a cudagraph here
    ) -> torch.Tensor:

        should_microbatch = False
@@ -2402,19 +2403,24 @@ class GPUModelRunner(LoRAModelRunnerMixin):
        start_time = time.perf_counter()
        start_free_gpu_memory = torch.cuda.mem_get_info()[0]

+        logger.info("CAPTURE MODEL START")
        # Trigger CUDA graph capture for specific shapes.
        # Capture the large shapes first so that the smaller shapes
        # can reuse the memory pool allocated for the large shapes.
        with graph_capture(device=self.device):
            skip_attn = not self.vllm_config.compilation_config.full_cuda_graph
+            allow_microbatching = self.parallel_config.enable_microbatching
            for num_tokens in reversed(self.cudagraph_batch_sizes):
                for _ in range(self.vllm_config.compilation_config.
                               cudagraph_num_of_warmups):
-                    self._dummy_run(num_tokens, skip_attn=skip_attn)
+                    self._dummy_run(num_tokens, skip_attn=skip_attn, 
+                                    allow_microbatching=allow_microbatching)
                # print("CUDAGRAPH CAPTURE START")
-                self._dummy_run(num_tokens, skip_attn=skip_attn)
+                self._dummy_run(num_tokens, skip_attn=skip_attn, 
+                                allow_microbatching=allow_microbatching)
                # print("CUDAGRAPH CAPTURE END")

+        logger.info("CAPTURE MODEL END")
        end_time = time.perf_counter()
        end_free_gpu_memory = torch.cuda.mem_get_info()[0]
        elapsed_time = end_time - start_time