From 52a7d91980012e4e476e2ade43bbcce8ed7a91bd Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Fri, 3 Oct 2025 13:25:00 -0400
Subject: [PATCH] debug

Signed-off-by: Robert Shaw
---
 vllm/v1/worker/gpu_model_runner.py | 2 ++
 vllm/v1/worker/ubatch_splitting.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index ff95acf0c016..b6b16968523c 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2400,6 +2400,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         # Run the model.
         # Use persistent buffers for CUDA graphs.
+        logger.info(f"====== EXECUTE {ubatch_slices=}, {num_input_tokens=}, {num_tokens_across_dp=}")
         with (set_forward_context(
                 attn_metadata,
                 self.vllm_config,
@@ -3046,6 +3047,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             (1 token) and prefill (multiple tokens) requests.
             remove_lora: If False, dummy LoRAs are not destroyed after the run
         """
+        logger.info("====== DUMMY RUN")
         assert cudagraph_runtime_mode is None or \
             cudagraph_runtime_mode.valid_runtime_modes()
 
diff --git a/vllm/v1/worker/ubatch_splitting.py b/vllm/v1/worker/ubatch_splitting.py
index 7767750aa604..49fe4e6c43d8 100644
--- a/vllm/v1/worker/ubatch_splitting.py
+++ b/vllm/v1/worker/ubatch_splitting.py
@@ -167,6 +167,7 @@ def ubatch_split(
         num_tokens_unpadded,
         uniform_decode=uniform_decode,
     )
+    logger.info(f"==== {should_attempt_ubatching=}, {num_tokens_unpadded=}")
 
     # Don't microbatch unless every other DP worker is also microbatching
     should_ubatch, num_tokens_after_padding = get_dp_padding_ubatch(
@@ -175,6 +176,7 @@ def ubatch_split(
         should_attempt_ubatching,
         vllm_config,
     )
+    logger.info(f"==== {should_ubatch=}, {num_tokens_after_padding=}")
     if not should_ubatch:
         return (None, None)
 