Signed-off-by: Sage Moore <sage@neuralmagic.com>
This commit is contained in:
Sage Moore 2025-07-03 19:52:41 +00:00
parent 06cc133a63
commit 908e9f8f54
3 changed files with 1 addition and 10 deletions

View File

@@ -4578,7 +4578,7 @@ class VllmConfig:
# More specifically piecewise cuda-graphs
logger.warning_once(
"Piecewise compilation is not supported with "
"microbatching. Disabling piecewiseching compilation.")
"microbatching. Disabling piecewise compilation.")
self.compilation_config.level = CompilationLevel.NO_COMPILATION
disable_chunked_prefill_reasons: list[str] = []

View File

@@ -44,9 +44,7 @@ class DPMetadata:
device="cpu",
dtype=torch.int32)
from vllm.distributed.parallel_state import get_dp_group
# logger.info("STARTING AR num_tokens_across_dp")
dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group)
# logger.info("finishing num_tokens_across_dp")
return num_tokens_tensor
@staticmethod
@@ -57,16 +55,13 @@ class DPMetadata:
device="cpu",
dtype=torch.int32)
from vllm.distributed.parallel_state import get_dp_group
# logger.info(f"should_ubatch_tensor before ar {should_ubatch_tensor}")
dist.all_reduce(should_ubatch_tensor, group=get_dp_group().cpu_group)
# logger.info(f"should_ubatch_tensor after ar {should_ubatch_tensor}")
# If there's an incorrect ordering of ARs across DP ranks, this tensor
# can end up containing the number of padded tokens for a DP rank
assert torch.all(should_ubatch_tensor <= 1)
result: bool = bool(torch.all(should_ubatch_tensor == 1).item())
# print(f"FINISHING AR should_ubatch_across_dp {result} {should_ubatch_tensor}")
return result
@staticmethod

View File

@@ -823,8 +823,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
))
for layer_name in kv_cache_group_spec.layer_names:
assert type(attn_metadata) is list
# assert attn_metadata_i is not None
# What if it's None? Do we still add it to the list?
attn_metadata[ubid][layer_name] = attn_metadata_i
else:
attn_metadata_i = (
@@ -1581,7 +1579,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
@torch.inference_mode()
def _ubatch_thread(results, model, ubatch_metadata):
# print(f"Starting Request on ubatch: {ubatch_ctx.id}", flush=True)
with ubatch_metadata.context:
model_output = model(
input_ids=ubatch_metadata.input_ids,
@@ -1590,7 +1587,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
inputs_embeds=ubatch_metadata.inputs_embeds,
)
results.append((ubatch_metadata.context.id, model_output))
# print(f"Finishing Request on ubatch: {ubatch_ctx.id}", flush=True)
def _run_ubatches(ubatch_metadata, model) -> torch.Tensor:
results: list[tuple[int, torch.Tensor]] = []