mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-22 12:57:55 +08:00
pplx format
Signed-off-by: Sage Moore <sage@neuralmagic.com>
This commit is contained in:
parent
243eac58a4
commit
d46397661f
@ -661,8 +661,6 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
|
|||||||
f"Hidden size mismatch {hidden_states.size(-1)} "
|
f"Hidden size mismatch {hidden_states.size(-1)} "
|
||||||
f"!= {w1.size(2)}")
|
f"!= {w1.size(2)}")
|
||||||
|
|
||||||
# print("in batched triton experts", hidden_states.shape, expert_num_tokens)
|
|
||||||
|
|
||||||
assert hidden_states.is_contiguous(
|
assert hidden_states.is_contiguous(
|
||||||
), "Hidden_states must be contiguous"
|
), "Hidden_states must be contiguous"
|
||||||
assert w1.stride(-1) == 1, "Stride of last dimension must be 1"
|
assert w1.stride(-1) == 1, "Stride of last dimension must be 1"
|
||||||
|
|||||||
@ -8,9 +8,8 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
|
|||||||
from vllm.model_executor.layers.fused_moe.utils import (
|
from vllm.model_executor.layers.fused_moe.utils import (
|
||||||
moe_kernel_quantize_input)
|
moe_kernel_quantize_input)
|
||||||
from vllm.v1.worker.ubatching import (
|
from vllm.v1.worker.ubatching import (
|
||||||
get_current_ubatch_context, yield_and_switch_from_compute_to_comm_impl,
|
get_current_ubatch_context, yield_and_switch_from_comm_to_compute_impl,
|
||||||
yield_and_switch_from_comm_to_compute_impl
|
yield_and_switch_from_compute_to_comm_impl)
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# Note use: layer.get_all_to_all() to get an AllToAll instance
|
# Note use: layer.get_all_to_all() to get an AllToAll instance
|
||||||
@ -128,10 +127,10 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
|||||||
ubatch_ctx = get_current_ubatch_context()
|
ubatch_ctx = get_current_ubatch_context()
|
||||||
ubatch_id = ubatch_ctx.id if ubatch_ctx is not None else -1
|
ubatch_id = ubatch_ctx.id if ubatch_ctx is not None else -1
|
||||||
yield_and_switch_from_compute_to_comm_impl(schedule="default")
|
yield_and_switch_from_compute_to_comm_impl(schedule="default")
|
||||||
dispatch(True) # Send
|
dispatch(True) # Send
|
||||||
# torch.cuda.synchronize()
|
# torch.cuda.synchronize()
|
||||||
# print(f"{ubatch_id} AFTER SEND SYNC", flush=True)
|
# print(f"{ubatch_id} AFTER SEND SYNC", flush=True)
|
||||||
dispatch(False) # Recv
|
dispatch(False) # Recv
|
||||||
# torch.cuda.synchronize()
|
# torch.cuda.synchronize()
|
||||||
# print(f"{ubatch_id} AFTER RECV SYNC", flush=True)
|
# print(f"{ubatch_id} AFTER RECV SYNC", flush=True)
|
||||||
yield_and_switch_from_comm_to_compute_impl(schedule="default")
|
yield_and_switch_from_comm_to_compute_impl(schedule="default")
|
||||||
@ -174,6 +173,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
|||||||
do_send=send,
|
do_send=send,
|
||||||
do_recv=not send,
|
do_recv=not send,
|
||||||
)
|
)
|
||||||
|
|
||||||
yield_and_switch_from_compute_to_comm_impl(schedule="default")
|
yield_and_switch_from_compute_to_comm_impl(schedule="default")
|
||||||
combine(True)
|
combine(True)
|
||||||
# torch.cuda.synchronize()
|
# torch.cuda.synchronize()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user