mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-07 19:57:08 +08:00
cleanup
Signed-off-by: Sage Moore <sage@neuralmagic.com>
This commit is contained in:
parent
06cc133a63
commit
908e9f8f54
@@ -4578,7 +4578,7 @@ class VllmConfig:
|
||||
# More specifically piecewise cuda-graphs
|
||||
logger.warning_once(
|
||||
"Piecewise compilation is not supported with "
|
||||
"microbatching. Disabling piecewiseching compilation.")
|
||||
"microbatching. Disabling piecewise compilation.")
|
||||
self.compilation_config.level = CompilationLevel.NO_COMPILATION
|
||||
disable_chunked_prefill_reasons: list[str] = []
|
||||
|
||||
|
||||
@@ -44,9 +44,7 @@ class DPMetadata:
|
||||
device="cpu",
|
||||
dtype=torch.int32)
|
||||
from vllm.distributed.parallel_state import get_dp_group
|
||||
# logger.info("STARTING AR num_tokens_across_dp")
|
||||
dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group)
|
||||
# logger.info("finishing num_tokens_across_dp")
|
||||
return num_tokens_tensor
|
||||
|
||||
@staticmethod
|
||||
@@ -57,16 +55,13 @@ class DPMetadata:
|
||||
device="cpu",
|
||||
dtype=torch.int32)
|
||||
from vllm.distributed.parallel_state import get_dp_group
|
||||
# logger.info(f"should_ubatch_tensor before ar {should_ubatch_tensor}")
|
||||
dist.all_reduce(should_ubatch_tensor, group=get_dp_group().cpu_group)
|
||||
# logger.info(f"should_ubatch_tensor after ar {should_ubatch_tensor}")
|
||||
|
||||
# If there's an incorrect ordering of ARs across DP ranks, this tensor
|
||||
# can end up containing the number of padded tokens for a DP rank
|
||||
assert torch.all(should_ubatch_tensor <= 1)
|
||||
|
||||
result: bool = bool(torch.all(should_ubatch_tensor == 1).item())
|
||||
# print(f"FINISHING AR should_ubatch_across_dp {result} {should_ubatch_tensor}")
|
||||
return result
|
||||
|
||||
@staticmethod
|
||||
|
||||
@@ -823,8 +823,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
))
|
||||
for layer_name in kv_cache_group_spec.layer_names:
|
||||
assert type(attn_metadata) is list
|
||||
# assert attn_metadata_i is not None
|
||||
# What if it's None? Do we still add it to the list?
|
||||
attn_metadata[ubid][layer_name] = attn_metadata_i
|
||||
else:
|
||||
attn_metadata_i = (
|
||||
@@ -1581,7 +1579,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
|
||||
@torch.inference_mode()
|
||||
def _ubatch_thread(results, model, ubatch_metadata):
|
||||
# print(f"Starting Request on ubatch: {ubatch_ctx.id}", flush=True)
|
||||
with ubatch_metadata.context:
|
||||
model_output = model(
|
||||
input_ids=ubatch_metadata.input_ids,
|
||||
@@ -1590,7 +1587,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
inputs_embeds=ubatch_metadata.inputs_embeds,
|
||||
)
|
||||
results.append((ubatch_metadata.context.id, model_output))
|
||||
# print(f"Finishing Request on ubatch: {ubatch_ctx.id}", flush=True)
|
||||
|
||||
def _run_ubatches(ubatch_metadata, model) -> torch.Tensor:
|
||||
results: list[tuple[int, torch.Tensor]] = []
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user