From 908e9f8f54bc8ddc0975f8d6ab5177dbb764ac89 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Thu, 3 Jul 2025 19:52:41 +0000 Subject: [PATCH] cleanup Signed-off-by: Sage Moore --- vllm/config.py | 2 +- vllm/forward_context.py | 5 ----- vllm/v1/worker/gpu_model_runner.py | 4 ---- 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 1396055997c45..904543250df6e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4578,7 +4578,7 @@ class VllmConfig: # More specifically piecewise cuda-graphs logger.warning_once( "Piecewise compilation is not supported with " - "microbatching. Disabling piecewiseching compilation.") + "microbatching. Disabling piecewise compilation.") self.compilation_config.level = CompilationLevel.NO_COMPILATION disable_chunked_prefill_reasons: list[str] = [] diff --git a/vllm/forward_context.py b/vllm/forward_context.py index ed86247dd9d28..2cdd62c72d581 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -44,9 +44,7 @@ class DPMetadata: device="cpu", dtype=torch.int32) from vllm.distributed.parallel_state import get_dp_group - # logger.info("STARTING AR num_tokens_across_dp") dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group) - # logger.info("finishing num_tokens_across_dp") return num_tokens_tensor @staticmethod @@ -57,16 +55,13 @@ class DPMetadata: device="cpu", dtype=torch.int32) from vllm.distributed.parallel_state import get_dp_group - # logger.info(f"should_ubatch_tensor before ar {should_ubatch_tensor}") dist.all_reduce(should_ubatch_tensor, group=get_dp_group().cpu_group) - # logger.info(f"should_ubatch_tensor after ar {should_ubatch_tensor}") # If there's an incorrect ordering of ARs across DP ranks, this tensor # can end up containing the number of padded tokens for a DP rank assert torch.all(should_ubatch_tensor <= 1) result: bool = bool(torch.all(should_ubatch_tensor == 1).item()) - # print(f"FINISHING AR should_ubatch_across_dp {result} {should_ubatch_tensor}") return result @staticmethod diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index de3ec30121ca7..10b5d42ebaa30 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -823,8 +823,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): )) for layer_name in kv_cache_group_spec.layer_names: assert type(attn_metadata) is list - # assert attn_metadata_i is not None - # What if it's None? Do we still add it to the list? attn_metadata[ubid][layer_name] = attn_metadata_i else: attn_metadata_i = ( @@ -1581,7 +1579,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): @torch.inference_mode() def _ubatch_thread(results, model, ubatch_metadata): - # print(f"Starting Request on ubatch: {ubatch_ctx.id}", flush=True) with ubatch_metadata.context: model_output = model( input_ids=ubatch_metadata.input_ids, @@ -1590,7 +1587,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): inputs_embeds=ubatch_metadata.inputs_embeds, ) results.append((ubatch_metadata.context.id, model_output)) - # print(f"Finishing Request on ubatch: {ubatch_ctx.id}", flush=True) def _run_ubatches(ubatch_metadata, model) -> torch.Tensor: results: list[tuple[int, torch.Tensor]] = []