wip

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2026-05-21 18:17:02 +08:00 · 2025-05-23 03:31:49 +00:00 · 2025-05-23 03:31:49 +00:00 · 18bf91e6a8
commit 18bf91e6a8
parent 00f526f55b
4 changed files with 17 additions and 5 deletions
--- a/examples/basic-ub.py
+++ b/examples/basic-ub.py
@ -40,11 +40,11 @@ def main():
              max_model_len=1024,
              #load_format="dummy",
              ###############
-              tensor_parallel_size=1,
-              #data_parallel_size=2,
-              enable_expert_parallel=False,
+              #tensor_parallel_size=1,
+              data_parallel_size=2,
+              enable_expert_parallel=True,
              ###############
-              enable_microbatching=True, 
+              #enable_microbatching=True, 
    )
    # Generate texts from the prompts.
    # The output is a list of RequestOutput objects
--- a/vllm/config.py
+++ b/vllm/config.py
@ -4332,7 +4332,7 @@ class VllmConfig:
                logger.warning_once(
                    "Piecewise compilation is not supported with "
                    "microbatching. Disabling piecewiseching compilation.")
-                self.compilation_config.level = CompilationLevel.DYNAMO_ONCE
+                self.compilation_config.level = CompilationLevel.NO_COMPILATION
             

        if self.model_config and self.model_config.use_mla and \
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@ -7,6 +7,7 @@ import torch
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.model_executor.layers.fused_moe.utils import (
    moe_kernel_quantize_input)
+from vllm.v1.worker.ubatching import get_current_ubatch_context, yield_impl


 # Note use: layer.get_all_to_all() to get an AllToAll instance
@ -117,7 +118,11 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                do_send=send,
                do_recv=not send,
            )
+        
+        # if ubatch_ctx is not None:
+        #     ubatch_ctx.gpu_stream_wait()
        dispatch(True) # Send
+        yield_impl(gpu_wait=False)
        dispatch(False) # Recv

        return expert_x, expert_x_scale, expert_num_tokens
@ -155,5 +160,8 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                do_send=send,
                do_recv=not send,
            )
+        # if ubatch_ctx is not None:
+        #     ubatch_ctx.gpu_stream_wait()
        combine(True)
+        yield_impl(gpu_wait=False)
        combine(False)
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@ -49,6 +49,7 @@ from vllm.model_executor.model_loader.weight_utils import (
    default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
+from vllm.v1.worker.ubatching import get_current_ubatch_context

 from .interfaces import SupportsPP
 from .utils import (PPMissingLayer, is_pp_missing_parameter,
@ -656,6 +657,9 @@ class DeepseekV2Model(nn.Module):
        intermediate_tensors: Optional[IntermediateTensors],
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
+        if ubatch_ctx := get_current_ubatch_context() is not None:
+            print("in forward, ubatch:", ubatch_ctx.id)
+        
        if get_pp_group().is_first_rank:
            if inputs_embeds is not None:
                hidden_states = inputs_embeds