mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-06 23:27:12 +08:00
debugging hang
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
This commit is contained in:
parent
2dc3b8b0a2
commit
9edd08231b
@ -661,6 +661,8 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
|
||||
f"Hidden size mismatch {hidden_states.size(-1)} "
|
||||
f"!= {w1.size(2)}")
|
||||
|
||||
print("in batched triton experts", hidden_states.shape, expert_num_tokens)
|
||||
|
||||
assert hidden_states.is_contiguous(
|
||||
), "Hidden_states must be contiguous"
|
||||
assert w1.stride(-1) == 1, "Stride of last dimension must be 1"
|
||||
|
||||
@ -29,6 +29,7 @@ from vllm.model_executor.utils import set_weight_attrs
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.platforms.interface import CpuArchEnum
|
||||
from vllm.utils import direct_register_custom_op
|
||||
from vllm.v1.worker.ubatching import get_current_ubatch_context
|
||||
|
||||
has_pplx = importlib.util.find_spec("pplx_kernels") is not None
|
||||
|
||||
@ -442,7 +443,8 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
|
||||
|
||||
if isinstance(prepare_finalize,
|
||||
(BatchedPrepareAndFinalize, PplxPrepareAndFinalize)):
|
||||
logger.debug("BatchedTritonExperts %s", self.moe)
|
||||
print("BatchedTritonExperts %s", self.moe)
|
||||
|
||||
experts = BatchedTritonExperts(
|
||||
max_num_tokens=MOE_DP_CHUNK_SIZE,
|
||||
world_size=world_size,
|
||||
@ -454,7 +456,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
|
||||
block_shape=None,
|
||||
)
|
||||
else:
|
||||
logger.debug("TritonExperts %s", self.moe)
|
||||
print("TritonExperts %s", self.moe)
|
||||
experts = TritonExperts(
|
||||
use_fp8_w8a8=False,
|
||||
use_int8_w8a8=False,
|
||||
@ -1298,6 +1300,9 @@ class FusedMoE(torch.nn.Module):
|
||||
max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp_cpu
|
||||
moe_dp_chunk_size_per_rank = MOE_DP_CHUNK_SIZE
|
||||
|
||||
if (ubatch_ctdx := get_current_ubatch_context()) is not None:
|
||||
print("in fused moe, ubatch:", ubatch_ctdx.id, "chunk size:", max_tokens_across_dp, "moe_dp_chunk_size_per_rank", moe_dp_chunk_size_per_rank)
|
||||
|
||||
num_tokens = full_hidden_states.size(0)
|
||||
for chunk_start_ in range(0, max_tokens_across_dp,
|
||||
moe_dp_chunk_size_per_rank):
|
||||
@ -1396,6 +1401,8 @@ def moe_forward(hidden_states: torch.Tensor, router_logits: torch.Tensor,
|
||||
forward_context: ForwardContext = get_forward_context()
|
||||
self = forward_context.no_compile_layers[layer_name]
|
||||
assert self.quant_method is not None
|
||||
if (ubatch_ctx := get_current_ubatch_context()) is not None:
|
||||
print("in fused moe, ubatch:", ubatch_ctx.id, self)
|
||||
|
||||
return self.forward_impl(hidden_states, router_logits)
|
||||
|
||||
|
||||
@ -3,6 +3,7 @@ from abc import ABC, abstractmethod
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from vllm.v1.worker.ubatching import get_current_ubatch_context
|
||||
|
||||
#
|
||||
# This file defines a set of base classes used to make MoE kernels more modular.
|
||||
@ -335,10 +336,20 @@ class FusedMoEModularKernel(torch.nn.Module):
|
||||
device=a1.device,
|
||||
dtype=workspace_dtype)
|
||||
|
||||
if (ubatch_ctx := get_current_ubatch_context()) is not None:
|
||||
print("in modular moe, ubatch:", ubatch_ctx.id)
|
||||
|
||||
a1q, a1q_scale, expert_num_tokens = self.prepare_finalize.prepare(
|
||||
a1, a1_scale, a2_scale, topk_weights, topk_ids, global_num_experts,
|
||||
expert_map, apply_router_weight_on_input)
|
||||
|
||||
if (ubatch_ctx := get_current_ubatch_context()) is not None:
|
||||
print("in modular moe2, ubatch:", ubatch_ctx.id, self.fused_experts)
|
||||
|
||||
print("pre synchronize")
|
||||
torch.cuda.synchronize(a1.device)
|
||||
print("post synchronize")
|
||||
|
||||
fused_out = self.fused_experts.apply(
|
||||
a1q,
|
||||
w1,
|
||||
@ -358,6 +369,9 @@ class FusedMoEModularKernel(torch.nn.Module):
|
||||
expert_num_tokens=expert_num_tokens,
|
||||
)
|
||||
|
||||
if (ubatch_ctx := get_current_ubatch_context()) is not None:
|
||||
print("in modular moe3, ubatch:", ubatch_ctx.id, self.fused_experts)
|
||||
|
||||
self.prepare_finalize.finalize(output, fused_out, topk_weights,
|
||||
topk_ids, apply_router_weight_on_input)
|
||||
|
||||
|
||||
@ -119,11 +119,14 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
||||
do_recv=not send,
|
||||
)
|
||||
|
||||
# if ubatch_ctx is not None:
|
||||
# ubatch_ctx.gpu_stream_wait()
|
||||
print("Dispatch pre-wait")
|
||||
if (ubatch_ctx := get_current_ubatch_context()) is not None:
|
||||
ubatch_ctx.gpu_stream_wait()
|
||||
print("Dispatch launched")
|
||||
dispatch(True) # Send
|
||||
yield_impl(gpu_wait=False)
|
||||
dispatch(False) # Recv
|
||||
print("Finished dispatch")
|
||||
|
||||
return expert_x, expert_x_scale, expert_num_tokens
|
||||
|
||||
@ -160,8 +163,12 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
||||
do_send=send,
|
||||
do_recv=not send,
|
||||
)
|
||||
# if ubatch_ctx is not None:
|
||||
# ubatch_ctx.gpu_stream_wait()
|
||||
|
||||
print("Combine pre-wait")
|
||||
if (ubatch_ctx := get_current_ubatch_context()) is not None:
|
||||
ubatch_ctx.gpu_stream_wait()
|
||||
combine(True)
|
||||
print("Combine launched")
|
||||
yield_impl(gpu_wait=False)
|
||||
combine(False)
|
||||
print("Finished combine")
|
||||
|
||||
@ -564,6 +564,8 @@ class DeepseekV2DecoderLayer(nn.Module):
|
||||
hidden_states: torch.Tensor,
|
||||
residual: Optional[torch.Tensor],
|
||||
) -> torch.Tensor:
|
||||
if (ubatch_ctx := get_current_ubatch_context()) is not None:
|
||||
print("in decoder, ubatch:", ubatch_ctx.id)
|
||||
# Self Attention
|
||||
if residual is None:
|
||||
residual = hidden_states
|
||||
@ -657,7 +659,7 @@ class DeepseekV2Model(nn.Module):
|
||||
intermediate_tensors: Optional[IntermediateTensors],
|
||||
inputs_embeds: Optional[torch.Tensor] = None,
|
||||
) -> Union[torch.Tensor, IntermediateTensors]:
|
||||
if ubatch_ctx := get_current_ubatch_context() is not None:
|
||||
if (ubatch_ctx := get_current_ubatch_context()) is not None:
|
||||
print("in forward, ubatch:", ubatch_ctx.id)
|
||||
|
||||
if get_pp_group().is_first_rank:
|
||||
|
||||
@ -1287,10 +1287,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
scheduler_output: Optional["SchedulerOutput"] = None,
|
||||
is_dummy_run: bool = False):
|
||||
|
||||
num_dummy_tokens = num_scheduled_tokens if is_dummy_run else 1
|
||||
|
||||
def model_inputs(tokens_slice: slice, use_dummy_input: bool) -> tuple:
|
||||
if use_dummy_input:
|
||||
num_tokens = num_scheduled_tokens or 1
|
||||
return self._get_dummy_model_inputs(num_tokens)
|
||||
return self._get_dummy_model_inputs(num_dummy_tokens)
|
||||
else:
|
||||
assert scheduler_output is not None
|
||||
return self._get_model_inputs(tokens_slice, scheduler_output)
|
||||
@ -1301,7 +1302,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
model_inputs(token_slice, use_dummy_input)
|
||||
with context:
|
||||
if isinstance(context, UBatchContext):
|
||||
print(f"Running ubatch {context.id} with input_ids {input_ids.shape} and positions {positions.shape}")
|
||||
print(f"Running ubatch {context.id} with input_ids {input_ids.shape} and positions {positions.shape} use_dummy_input {use_dummy_input} token_slice {token_slice}")
|
||||
model_output = self.model(
|
||||
input_ids=input_ids,
|
||||
positions=positions,
|
||||
@ -1333,19 +1334,27 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
if not hasattr(self, "ubatch_streams"):
|
||||
# Create the ubatch streams
|
||||
self.ubatch_streams = [torch.cuda.Stream(self.device) for _ in range(len(ubatch_slices))]
|
||||
|
||||
ubatch_fwd_ctxs = [create_forward_context(
|
||||
attn_metadata[i] if attn_metadata is not None else None,
|
||||
self.vllm_config, num_tokens=(tokens_slice.stop - tokens_slice.start)
|
||||
) for i, (_, tokens_slice) in enumerate(ubatch_slices)]
|
||||
|
||||
|
||||
# We have to be careful creating the forward contexts here otherwise we can end
|
||||
# up with the dummy contexts have num_tokens set to 0
|
||||
# ubatch_fwd_ctxs = [create_forward_context(
|
||||
# attn_metadata[i] if attn_metadata is not None else None,
|
||||
# self.vllm_config, num_tokens=(tokens_slice.stop - tokens_slice.start)
|
||||
# ) for i, (_, tokens_slice) in enumerate(ubatch_slices)]
|
||||
ubatch_ctxs, start_hook = make_ubatch_context_chain(
|
||||
len(ubatch_slices),
|
||||
fwd_ctxs=ubatch_fwd_ctxs,
|
||||
#fwd_ctxs=ubatch_fwd_ctxs,
|
||||
streams=self.ubatch_streams, #stream=root_stream, # Only works currently if everything is run on the same stream
|
||||
device=self.device)
|
||||
setup_done = threading.Event()
|
||||
ubatch_threads = []
|
||||
|
||||
# Initialize Events? not sure if this helps
|
||||
for ubatch_ctx in ubatch_ctxs:
|
||||
ubatch_ctx.gpu_wait_event.record(ubatch_ctx.stream)
|
||||
ubatch_ctx.stream.wait_event(ubatch_ctx.gpu_wait_event)
|
||||
|
||||
# Ubatches will manually manage the forward context, so we override
|
||||
# it to None here so we can have it restored correctly later
|
||||
with override_forward_context(None):
|
||||
@ -1354,6 +1363,13 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
is_dummy_ubatch = tokens_slice.stop <= tokens_slice.start
|
||||
assert not is_dummy_ubatch or i == len(ubatch_slices) - 1 or is_dummy_run
|
||||
|
||||
print("ubatch", i, "tokens slice", tokens_slice, "is dummy ubatch", is_dummy_ubatch, "is dummy run", is_dummy_run)
|
||||
|
||||
num_tokens = num_dummy_tokens if is_dummy_ubatch or is_dummy_run else (tokens_slice.stop - tokens_slice.start)
|
||||
ubatch_ctxs[i].forward_context = create_forward_context(
|
||||
attn_metadata[i] if attn_metadata is not None else None,
|
||||
self.vllm_config, num_tokens=num_tokens)
|
||||
|
||||
thread = threading.Thread(target=_ubatch_thread, args=(
|
||||
ubatch_ctxs[i],
|
||||
root_stream,
|
||||
|
||||
@ -17,7 +17,7 @@ class UBatchContext:
|
||||
def __init__(self,
|
||||
id: int,
|
||||
stream: torch.cuda.Stream,
|
||||
fwd_ctx: forward_context.ForwardContext,
|
||||
#fwd_ctx: forward_context.ForwardContext,
|
||||
cpu_wait_event: threading.Event,
|
||||
cpu_signal_event: threading.Event,
|
||||
gpu_wait_event: torch.cuda.Event,
|
||||
@ -27,7 +27,7 @@ class UBatchContext:
|
||||
self.id = id
|
||||
self.stream = stream
|
||||
self.original_stream = current_stream()
|
||||
self.forward_context = fwd_ctx
|
||||
self.forward_context = None #fwd_ctx
|
||||
self.cpu_wait_event = cpu_wait_event
|
||||
self.cpu_signal_event = cpu_signal_event
|
||||
self.gpu_wait_event = gpu_wait_event
|
||||
@ -80,6 +80,7 @@ class UBatchContext:
|
||||
# until ubatch0-dispatch is done avoiding overlapping dispatches that
|
||||
# might share underlying buffers
|
||||
def gpu_stream_wait(self):
|
||||
print("Waiting ubatch %d on %s in stream %s" % (self.id, self.gpu_wait_event, self.stream))
|
||||
self.stream.wait_event(self.gpu_wait_event)
|
||||
|
||||
def _yield(self, gpu_wait: bool = True):
|
||||
@ -92,6 +93,7 @@ class UBatchContext:
|
||||
|
||||
def _signal(self):
|
||||
# Wait for the next batch to signal back
|
||||
print(f"signaling ubatch {self.id} to {self.gpu_signal_event} on {self.stream}")
|
||||
self.gpu_signal_event.record(self.stream)
|
||||
# Signal that this batch reached the barrier
|
||||
self.cpu_signal_event.set()
|
||||
@ -134,7 +136,7 @@ def yield_(x: torch.Tensor, schedule: str="default") -> None:
|
||||
"""
|
||||
def make_ubatch_context_chain(
|
||||
num_micro_batches: int,
|
||||
fwd_ctxs: forward_context.ForwardContext,
|
||||
#fwd_ctxs: forward_context.ForwardContext,
|
||||
streams: Optional[list[torch.Stream]] = None,
|
||||
device: Optional[torch.device] = None
|
||||
) -> list[UBatchContext]:
|
||||
@ -152,7 +154,7 @@ def make_ubatch_context_chain(
|
||||
stream = (streams[i] if streams else None) or torch.cuda.Stream(device)
|
||||
ctx = UBatchContext(id=i,
|
||||
stream=stream,
|
||||
fwd_ctx=fwd_ctxs[i],
|
||||
#fwd_ctx=fwd_ctxs[i],
|
||||
cpu_wait_event=cpu_events[i],
|
||||
cpu_signal_event=cpu_events[(i + 1) % num_micro_batches],
|
||||
gpu_wait_event=gpu_events[i],
|
||||
@ -163,6 +165,7 @@ def make_ubatch_context_chain(
|
||||
|
||||
def start_hook(from_stream: torch.cuda.Stream):
|
||||
ctxs[0].gpu_wait_event.record(from_stream)
|
||||
print('singal to ubatch %d event %s from stream %s' % (ctxs[0].id, ctxs[0].gpu_wait_event, from_stream))
|
||||
ctxs[0].cpu_wait_event.set()
|
||||
|
||||
return ctxs, start_hook
|
||||
Loading…
x
Reference in New Issue
Block a user