Cleanup: remove commented-out debug statements and stale comments, fix a warning-message typo

Signed-off-by: Sage Moore <sage@neuralmagic.com>
2026-05-30 13:07:08 +08:00 · 2025-07-03 19:52:41 +00:00 · 2025-07-03 19:52:41 +00:00 · 908e9f8f54
commit 908e9f8f54
parent 06cc133a63
3 changed files with 1 addition and 10 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@ -4578,7 +4578,7 @@ class VllmConfig:
            #  More specifically piecewise cuda-graphs
            logger.warning_once(
                "Piecewise compilation is not supported with "
-                "microbatching. Disabling piecewiseching compilation.")
+                "microbatching. Disabling piecewise compilation.")
            self.compilation_config.level = CompilationLevel.NO_COMPILATION
        disable_chunked_prefill_reasons: list[str] = []
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@ -44,9 +44,7 @@ class DPMetadata:
                                         device="cpu",
                                         dtype=torch.int32)
        from vllm.distributed.parallel_state import get_dp_group
        # logger.info("STARTING AR num_tokens_across_dp")
        dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group)
        # logger.info("finishing num_tokens_across_dp")
        return num_tokens_tensor
    @staticmethod
@ -57,16 +55,13 @@ class DPMetadata:
                                         device="cpu",
                                         dtype=torch.int32)
        from vllm.distributed.parallel_state import get_dp_group
        # logger.info(f"should_ubatch_tensor before ar {should_ubatch_tensor}")
        dist.all_reduce(should_ubatch_tensor, group=get_dp_group().cpu_group)
        # logger.info(f"should_ubatch_tensor after ar {should_ubatch_tensor}")
        # If there's an incorrect ordering of ARs across DP ranks, this tensor 
        # can end up containing the number of padded tokens for a DP rank
        assert torch.all(should_ubatch_tensor <= 1)
        result: bool = bool(torch.all(should_ubatch_tensor == 1).item())
        # print(f"FINISHING AR should_ubatch_across_dp {result} {should_ubatch_tensor}")
        return result
    @staticmethod
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@ -823,8 +823,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                            ))
                    for layer_name in kv_cache_group_spec.layer_names:
                        assert type(attn_metadata) is list
                        # assert attn_metadata_i is not None
                        # What if it's None? Do we still add it to the list?
                        attn_metadata[ubid][layer_name] = attn_metadata_i
            else:
                attn_metadata_i = (
@ -1581,7 +1579,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
        @torch.inference_mode()
        def _ubatch_thread(results, model, ubatch_metadata):
            # print(f"Starting Request on ubatch: {ubatch_ctx.id}", flush=True)
            with ubatch_metadata.context:
                model_output = model(
                    input_ids=ubatch_metadata.input_ids,
@ -1590,7 +1587,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                    inputs_embeds=ubatch_metadata.inputs_embeds,
                )
            results.append((ubatch_metadata.context.id, model_output))
            # print(f"Finishing Request on ubatch: {ubatch_ctx.id}", flush=True)
        def _run_ubatches(ubatch_metadata, model) -> torch.Tensor:
            results: list[tuple[int, torch.Tensor]] = []