diff --git a/examples/basic-ub.py b/examples/basic-ub.py index 397f586b6598b..3f6fd2fdb82ea 100644 --- a/examples/basic-ub.py +++ b/examples/basic-ub.py @@ -40,11 +40,11 @@ def main(): max_model_len=1024, #load_format="dummy", ############### - tensor_parallel_size=1, - #data_parallel_size=2, - enable_expert_parallel=False, + #tensor_parallel_size=1, + data_parallel_size=2, + enable_expert_parallel=True, ############### - enable_microbatching=True, + #enable_microbatching=True, ) # Generate texts from the prompts. # The output is a list of RequestOutput objects diff --git a/vllm/config.py b/vllm/config.py index 42662ef9e6671..08de3fe32710e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4332,7 +4332,7 @@ class VllmConfig: logger.warning_once( "Piecewise compilation is not supported with " "microbatching. Disabling piecewiseching compilation.") - self.compilation_config.level = CompilationLevel.DYNAMO_ONCE + self.compilation_config.level = CompilationLevel.NO_COMPILATION if self.model_config and self.model_config.use_mla and \ diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py index 621eb8bdd9c77..b2774382ef6f0 100644 --- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -7,6 +7,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.model_executor.layers.fused_moe.utils import ( moe_kernel_quantize_input) +from vllm.v1.worker.ubatching import get_current_ubatch_context, yield_impl # Note use: layer.get_all_to_all() to get an AllToAll instance @@ -117,7 +118,11 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): do_send=send, do_recv=not send, ) + + # if ubatch_ctx is not None: + # ubatch_ctx.gpu_stream_wait() dispatch(True) # Send + yield_impl(gpu_wait=False) dispatch(False) # Recv return expert_x, expert_x_scale, expert_num_tokens @@ -155,5 
+160,8 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): do_send=send, do_recv=not send, ) + # if ubatch_ctx is not None: + # ubatch_ctx.gpu_stream_wait() combine(True) + yield_impl(gpu_wait=False) combine(False) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index b78c193c1345a..1cb22a109dacc 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -49,6 +49,7 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from vllm.v1.worker.ubatching import get_current_ubatch_context from .interfaces import SupportsPP from .utils import (PPMissingLayer, is_pp_missing_parameter, @@ -656,6 +657,9 @@ class DeepseekV2Model(nn.Module): intermediate_tensors: Optional[IntermediateTensors], inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: + if (ubatch_ctx := get_current_ubatch_context()) is not None: + print("in forward, ubatch:", ubatch_ctx.id) + if get_pp_group().is_first_rank: if inputs_embeds is not None: hidden_states = inputs_embeds