mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-02 05:37:54 +08:00
cleanup deepep ll
Signed-off-by: Sage Moore <sage@neuralmagic.com>
This commit is contained in:
parent
6e2a3c0841
commit
17a7ceef27
@ -131,8 +131,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
|||||||
|
|
||||||
hidden_size = a1.size(1)
|
hidden_size = a1.size(1)
|
||||||
ubatch_ctx = get_current_ubatch_context()
|
ubatch_ctx = get_current_ubatch_context()
|
||||||
ubatch_id = ubatch_ctx.id if ubatch_ctx is not None else -1
|
a2a_idx = ubatch_ctx.id if ubatch_ctx is not None else 0
|
||||||
a2a_idx = 0 if ubatch_id == -1 else ubatch_id
|
|
||||||
# assert hidden_size in self.SUPPORTED_HIDDEN_SIZES, \
|
# assert hidden_size in self.SUPPORTED_HIDDEN_SIZES, \
|
||||||
# (f"Hidden Size {hidden_size} not in supported list of hidden sizes"
|
# (f"Hidden Size {hidden_size} not in supported list of hidden sizes"
|
||||||
# f"{self.SUPPORTED_HIDDEN_SIZES}")
|
# f"{self.SUPPORTED_HIDDEN_SIZES}")
|
||||||
@ -155,8 +154,8 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
|||||||
a1 = a1 * rank_topk_weights.to(a1.dtype)
|
a1 = a1 * rank_topk_weights.to(a1.dtype)
|
||||||
|
|
||||||
# Dispatch
|
# Dispatch
|
||||||
# yield_and_switch_from_compute_to_comm_impl(schedule="default")
|
yield_and_switch_from_compute_to_comm_impl(schedule="default")
|
||||||
expert_x, expert_num_tokens, handle, event, hook = \
|
expert_x, expert_num_tokens, handle, _, _= \
|
||||||
self.buffers[a2a_idx].low_latency_dispatch(a1,
|
self.buffers[a2a_idx].low_latency_dispatch(a1,
|
||||||
rank_topk_ids,
|
rank_topk_ids,
|
||||||
self.max_tokens_per_rank,
|
self.max_tokens_per_rank,
|
||||||
@ -165,7 +164,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
|||||||
async_finish=False,
|
async_finish=False,
|
||||||
return_recv_hook=False)
|
return_recv_hook=False)
|
||||||
self.handles[a2a_idx] = handle
|
self.handles[a2a_idx] = handle
|
||||||
# yield_and_switch_from_comm_to_compute_impl(schedule="default")
|
yield_and_switch_from_comm_to_compute_impl(schedule="default")
|
||||||
|
|
||||||
expert_x, expert_x_scale = self._do_quant(expert_x, a1_scale, a2_scale,
|
expert_x, expert_x_scale = self._do_quant(expert_x, a1_scale, a2_scale,
|
||||||
a1.dtype)
|
a1.dtype)
|
||||||
@ -177,8 +176,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
|||||||
apply_router_weight_on_input: bool) -> None:
|
apply_router_weight_on_input: bool) -> None:
|
||||||
|
|
||||||
ubatch_ctx = get_current_ubatch_context()
|
ubatch_ctx = get_current_ubatch_context()
|
||||||
ubatch_id = ubatch_ctx.id if ubatch_ctx is not None else -1
|
a2a_idx = ubatch_ctx.id if ubatch_ctx is not None else 0
|
||||||
a2a_idx = 0 if ubatch_id == -1 else ubatch_id
|
|
||||||
handle = self.handles[a2a_idx]
|
handle = self.handles[a2a_idx]
|
||||||
assert handle is not None
|
assert handle is not None
|
||||||
|
|
||||||
@ -188,8 +186,8 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
|||||||
combine_topk_weights = torch.ones_like(topk_weights)
|
combine_topk_weights = torch.ones_like(topk_weights)
|
||||||
|
|
||||||
# TODO (varun) : Enable zero copy mode
|
# TODO (varun) : Enable zero copy mode
|
||||||
# yield_and_switch_from_compute_to_comm_impl(schedule="default")
|
yield_and_switch_from_compute_to_comm_impl(schedule="default")
|
||||||
_, event, hook = self.buffers[a2a_idx].low_latency_combine(
|
_ = self.buffers[a2a_idx].low_latency_combine(
|
||||||
fused_expert_output,
|
fused_expert_output,
|
||||||
topk_ids,
|
topk_ids,
|
||||||
combine_topk_weights,
|
combine_topk_weights,
|
||||||
@ -198,6 +196,5 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
|||||||
zero_copy=False,
|
zero_copy=False,
|
||||||
return_recv_hook=False,
|
return_recv_hook=False,
|
||||||
out=output)
|
out=output)
|
||||||
# event.current_stream_wait()
|
yield_and_switch_from_comm_to_compute_impl(schedule="default")
|
||||||
# yield_and_switch_from_comm_to_compute_impl(schedule="default")
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user