diff --git a/vllm/compilation/cuda_piecewise_backend.py b/vllm/compilation/cuda_piecewise_backend.py index 8bf957368f6ab..16a7098749f8e 100644 --- a/vllm/compilation/cuda_piecewise_backend.py +++ b/vllm/compilation/cuda_piecewise_backend.py @@ -105,6 +105,7 @@ class CUDAPiecewiseBackend: end_monitoring_torch_compile(self.vllm_config) def __call__(self, *args) -> Any: + # logger.info("CUDA BACKEND CALL") if not self.first_run_finished: self.first_run_finished = True self.check_for_ending_compilation() diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 05e4ca9f08b36..0474db0820c73 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -170,6 +170,7 @@ def _support_torch_compile( # e.g. TPU has the compilation logic in model runner, so we don't # need to compile the model inside. if self.do_not_compile or torch.compiler.is_compiling(): + logger.info("SKIPPING COMPILATION") return self.forward(*args, **kwargs) # the first compilation needs to have dynamic shapes marked diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py index d823706b63e03..cd5ec2f4cbf2d 100644 --- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -134,14 +134,14 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): do_recv=not send, ) - # yield_and_switch_from_compute_to_comm_impl(schedule="default") + yield_and_switch_from_compute_to_comm_impl(schedule="default") dispatch(True) # Send # torch.cuda.synchronize() # print(f"{ubatch_id} AFTER SEND SYNC", flush=True) dispatch(False) # Recv # torch.cuda.synchronize() # print(f"{ubatch_id} AFTER RECV SYNC", flush=True) - # yield_and_switch_from_comm_to_compute_impl(schedule="default") + yield_and_switch_from_comm_to_compute_impl(schedule="default") # torch.cuda.synchronize() if expert_x_scale is not None: expert_x_scale = expert_x_scale[:, :, 0:1] @@ -185,11 +185,11 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): do_recv=not send, ) - # yield_and_switch_from_compute_to_comm_impl(schedule="default") + yield_and_switch_from_compute_to_comm_impl(schedule="default") combine(True) # torch.cuda.synchronize() # print(f"{ubatch_id} AFTER COMBINE SEND SYNC", flush=True) combine(False) # print(f"{ubatch_id} AFTER COMBINE RECV SYNC", flush=True) - # yield_and_switch_from_comm_to_compute_impl(schedule="default") + yield_and_switch_from_comm_to_compute_impl(schedule="default") # torch.cuda.synchronize() diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c3a9f96f57f61..ebec98f7219c4 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -206,6 +206,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): self.use_cuda_graph = (self.vllm_config.compilation_config.level == CompilationLevel.PIECEWISE and not self.model_config.enforce_eager) + logger.info(f"self.use_cuda_graph {self.use_cuda_graph}") # TODO(woosuk): Provide an option to tune the max cudagraph batch size. # The convention is different. # self.cudagraph_batch_sizes sorts in ascending order. @@ -702,7 +703,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): if num_pad_tokens < scheduler_output.total_num_scheduled_tokens: self.pad_out_ubatch_first_stage(ubatch_slices, num_pad_tokens) else: - assert False # We bail out of ubatching here. This accounts for the case where # the padding would result in an "empty" second ubatch. # TODO: just make the second ubatch a dummy ubatch @@ -711,13 +711,13 @@ class GPUModelRunner(LoRAModelRunnerMixin): # Note that if we are attempting to ubatch by this point then we know that no # DP ranks are doing dummy runs - # if ubatch_slices: - # should_ubatch = self.should_ubatch(False if ubatch_bailout else True) - # if not should_ubatch: - # logger.info("SUCCESSFULLY BAILED OUT") - # num_pad_tokens = 0 - # num_tokens_after_padding = None - # ubatch_slices = None + if ubatch_slices: + should_ubatch = self.should_ubatch(False if ubatch_bailout else True) + if not should_ubatch: + logger.info("SUCCESSFULLY BAILED OUT") + num_pad_tokens = 0 + num_tokens_after_padding = None + ubatch_slices = None # This AR is only necessary in the case described above where @@ -2150,6 +2150,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): # For profiling runs we dont want microbatching but for # dp dummy runs we do. allow_microbatching: bool = False, + # Maybe return a cudagraph here ) -> torch.Tensor: should_microbatch = False @@ -2402,19 +2403,24 @@ class GPUModelRunner(LoRAModelRunnerMixin): start_time = time.perf_counter() start_free_gpu_memory = torch.cuda.mem_get_info()[0] + logger.info("CAPTURE MODEL START") # Trigger CUDA graph capture for specific shapes. # Capture the large shapes first so that the smaller shapes # can reuse the memory pool allocated for the large shapes. with graph_capture(device=self.device): skip_attn = not self.vllm_config.compilation_config.full_cuda_graph + allow_microbatching = self.parallel_config.enable_microbatching for num_tokens in reversed(self.cudagraph_batch_sizes): for _ in range(self.vllm_config.compilation_config. cudagraph_num_of_warmups): - self._dummy_run(num_tokens, skip_attn=skip_attn) + self._dummy_run(num_tokens, skip_attn=skip_attn, + allow_microbatching=allow_microbatching) # print("CUDAGRAPH CAPTURE START") - self._dummy_run(num_tokens, skip_attn=skip_attn) + self._dummy_run(num_tokens, skip_attn=skip_attn, + allow_microbatching=allow_microbatching) # print("CUDAGRAPH CAPTURE END") + logger.info("CAPTURE MODEL END") end_time = time.perf_counter() end_free_gpu_memory = torch.cuda.mem_get_info()[0] elapsed_time = end_time - start_time