diff --git a/tools/smoke_tests/Justfile b/tools/smoke_tests/Justfile
index 8ef526f4dd505..7ec6dbb185eb7 100644
--- a/tools/smoke_tests/Justfile
+++ b/tools/smoke_tests/Justfile
@@ -2,10 +2,10 @@
 vllm-directory := "/home/rshaw/vllm/"
 
 launch_dp_ep MODEL SIZE:
-    vllm serve {{MODEL}} --data-parallel-size {{SIZE}} --enable-expert-parallel --disable-log-requests
+    VLLM_ALL2ALL_BACKEND="pplx" vllm serve {{MODEL}} --data-parallel-size {{SIZE}} --enable-expert-parallel --disable-log-requests --max-model-len 32000 --enforce-eager
 
 launch_tp MODEL SIZE:
-    vllm serve {{MODEL}} --tensor-parallel-size {{SIZE}} --disable-log-requests
+    vllm serve {{MODEL}} --tensor-parallel-size {{SIZE}} --disable-log-requests --max-model-len 32000
 
 eval MODEL:
   lm_eval --model local-completions --tasks gsm8k \
@@ -18,4 +18,19 @@ benchmark MODEL NUM_PROMPTS:
     --random-input-len 1000 \
     --random-output-len 100 \
     --num-prompts {{NUM_PROMPTS}} \
+    --percentile-metrics ttft,tpot,itl,e2el \
+    --metric-percentiles 90,95,99 \
+    --ignore-eos \
+    --seed $(date +%s)
+
+benchmark_all_decode MODEL NUM_PROMPTS:
+  python {{vllm-directory}}/benchmarks/benchmark_serving.py \
+    --model {{MODEL}} \
+    --dataset-name random \
+    --random-input-len 1 \
+    --random-output-len 1000 \
+    --num-prompts {{NUM_PROMPTS}} \
+    --percentile-metrics ttft,tpot,itl,e2el \
+    --metric-percentiles 90,95,99 \
+    --ignore-eos \
-    --seed $(date +%s)
\ No newline at end of file
+    --seed $(date +%s)
diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
index 5a23a9f1ab09d..f890ecae040e9 100644
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import time
 from typing import Optional
 
 import pplx_kernels as pplx
 import torch
 
@@ -197,7 +198,8 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         # This argument is optional, defaults to indices.size(0)
         # There's not much point setting this unless it is != indices.size(0)
         bound_m: Optional[torch.Tensor] = None
 
+        start = time.perf_counter()  # host wall-clock timer for dispatch
         self.a2a.dispatch(
             out_expert_num_tokens=expert_num_tokens,
             out_expert_x=expert_x,
@@ -207,6 +209,8 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
             indices=topk_ids.view(dtype=torch.uint32),
             bound_m=bound_m,
         )
+        end = time.perf_counter()  # host wall-clock; no explicit CUDA sync
+        logger.debug("dispatch took %.3f ms", (end - start) * 1000)
 
         if expert_x_scale is not None:
             expert_x_scale = expert_x_scale[:, :, :orig_a_scale_block_shape]
@@ -248,8 +252,11 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         if apply_router_weight_on_input:
             topk_weights = torch.ones_like(topk_weights)
 
+        start = time.perf_counter()  # host wall-clock timer for combine
         self.a2a.combine(out_tokens=output,
                          indices=topk_ids.view(dtype=torch.uint32),
                          weights=topk_weights,
                          expert_y=fused_expert_output,
                          bound_m=bound_m)
+        end = time.perf_counter()  # host wall-clock; no explicit CUDA sync
+        logger.debug("combine took %.3f ms", (end - start) * 1000)
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index e2fdf6f8a11c7..0e6e35690d644 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -946,6 +946,7 @@ class DPEngineCoreProc(EngineCoreProc):
 
                 # We are in a running state and so must execute a dummy pass
                 # if the model didn't execute any ready requests.
+                logger.debug("Executing dummy batch for wave %d.", self.current_wave)
                 self.execute_dummy_batch()
 
             # 3) All-reduce operation to determine global unfinished reqs.