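Add pplx dispatch/combine timing logs and smoke-test tweaks

- tools/smoke_tests/Justfile: launch_dp_ep now sets
  VLLM_ALL2ALL_BACKEND="pplx" and passes --max-model-len 32000
  --enforce-eager; launch_tp passes --max-model-len 32000; benchmark
  gains --percentile-metrics, --metric-percentiles and --ignore-eos;
  new benchmark_all_decode recipe (random input len 1, output len 1000).
- pplx_prepare_finalize.py: log the time.perf_counter() duration of the
  a2a.dispatch and a2a.combine calls.
- v1/engine/core.py: log each dummy batch executed by DPEngineCoreProc.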

Signed-off-by: Robert Shaw <robshaw@redhat.com>
2026-06-27 04:57:13 +08:00 · 2025-07-15 02:05:26 +00:00 · 2025-07-15 02:05:26 +00:00 · e830434fe2
commit e830434fe2
parent 6ac7b874b1
3 changed files with 26 additions and 3 deletions
--- a/tools/smoke_tests/Justfile
+++ b/tools/smoke_tests/Justfile
@ -2,10 +2,10 @@
 vllm-directory := "/home/rshaw/vllm/"

 launch_dp_ep MODEL SIZE:
-    vllm serve {{MODEL}} --data-parallel-size {{SIZE}} --enable-expert-parallel --disable-log-requests
+    VLLM_ALL2ALL_BACKEND="pplx" vllm serve {{MODEL}} --data-parallel-size {{SIZE}} --enable-expert-parallel --disable-log-requests --max-model-len 32000 --enforce-eager

 launch_tp MODEL SIZE:
-    vllm serve {{MODEL}} --tensor-parallel-size {{SIZE}} --disable-log-requests
+    vllm serve {{MODEL}} --tensor-parallel-size {{SIZE}} --disable-log-requests --max-model-len 32000

 eval MODEL:
  lm_eval --model local-completions --tasks gsm8k \
@ -18,4 +18,19 @@ benchmark MODEL NUM_PROMPTS:
    --random-input-len 1000 \
    --random-output-len 100 \
    --num-prompts {{NUM_PROMPTS}} \
+    --percentile-metrics ttft,tpot,itl,e2el \
+    --metric-percentiles 90,95,99 \
+    --ignore-eos \
+    --seed $(date +%s)
+
+benchmark_all_decode MODEL NUM_PROMPTS: # decode-heavy counterpart of `benchmark`: random input len 1, output len 1000; seed is the current epoch second, so each run uses a different seed
+  python {{vllm-directory}}/benchmarks/benchmark_serving.py \
+    --model {{MODEL}} \
+    --dataset-name random \
+    --random-input-len 1 \
+    --random-output-len 1000 \
+    --num-prompts {{NUM_PROMPTS}} \
+    --percentile-metrics ttft,tpot,itl,e2el \
+    --metric-percentiles 90,95,99 \
+    --ignore-eos \
    --seed $(date +%s)
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Optional

+import time
 import pplx_kernels as pplx
 import torch

@ -197,7 +198,8 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
        # This argument is optional, defaults to indices.size(0)
        # There's not much point setting this unless it is != indices.size(0)
        bound_m: Optional[torch.Tensor] = None
-
+        
+        start = time.perf_counter()
        self.a2a.dispatch(
            out_expert_num_tokens=expert_num_tokens,
            out_expert_x=expert_x,
@ -207,6 +209,8 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
            indices=topk_ids.view(dtype=torch.uint32),
            bound_m=bound_m,
        )
+        end = time.perf_counter()
+        logger.info("dispatch took %.3f ms", (end - start) * 1000)

        if expert_x_scale is not None:
            expert_x_scale = expert_x_scale[:, :, :orig_a_scale_block_shape]
@ -248,8 +252,11 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
        if apply_router_weight_on_input:
            topk_weights = torch.ones_like(topk_weights)

+        start = time.perf_counter()
        self.a2a.combine(out_tokens=output,
                         indices=topk_ids.view(dtype=torch.uint32),
                         weights=topk_weights,
                         expert_y=fused_expert_output,
                         bound_m=bound_m)
+        end = time.perf_counter()
+        logger.info("combine took %.3f ms", (end - start) * 1000)
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@ -946,6 +946,7 @@ class DPEngineCoreProc(EngineCoreProc):

                # We are in a running state and so must execute a dummy pass
                # if the model didn't execute any ready requests.
+                logger.info("Executing dummy batch for wave %d.", self.current_wave)
                self.execute_dummy_batch()

            # 3) All-reduce operation to determine global unfinished reqs.