diff --git a/tools/smoke_tests/Justfile b/tools/smoke_tests/Justfile
index 8ef526f4dd505..7ec6dbb185eb7 100644
--- a/tools/smoke_tests/Justfile
+++ b/tools/smoke_tests/Justfile
@@ -2,10 +2,10 @@
 vllm-directory := "/home/rshaw/vllm/"
 
 launch_dp_ep MODEL SIZE:
-  vllm serve {{MODEL}} --data-parallel-size {{SIZE}} --enable-expert-parallel --disable-log-requests
+  VLLM_ALL2ALL_BACKEND="pplx" vllm serve {{MODEL}} --data-parallel-size {{SIZE}} --enable-expert-parallel --disable-log-requests --max-model-len 32000 --enforce-eager
 
 launch_tp MODEL SIZE:
-  vllm serve {{MODEL}} --tensor-parallel-size {{SIZE}} --disable-log-requests
+  vllm serve {{MODEL}} --tensor-parallel-size {{SIZE}} --disable-log-requests --max-model-len 32000
 
 eval MODEL:
   lm_eval --model local-completions --tasks gsm8k \
@@ -18,4 +18,19 @@ benchmark MODEL NUM_PROMPTS:
     --random-input-len 1000 \
     --random-output-len 100 \
     --num-prompts {{NUM_PROMPTS}} \
+    --percentile-metrics ttft,tpot,itl,e2el \
+    --metric-percentiles 90,95,99 \
+    --ignore-eos \
+    --seed $(date +%s)
+
+benchmark_all_decode MODEL NUM_PROMPTS:
+  python {{vllm-directory}}/benchmarks/benchmark_serving.py \
+    --model {{MODEL}} \
+    --dataset-name random \
+    --random-input-len 1 \
+    --random-output-len 1000 \
+    --num-prompts {{NUM_PROMPTS}} \
+    --percentile-metrics ttft,tpot,itl,e2el \
+    --metric-percentiles 90,95,99 \
+    --ignore-eos \
     --seed $(date +%s)
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
index 5a23a9f1ab09d..f890ecae040e9 100644
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import time
 from typing import Optional
 
 import pplx_kernels as pplx
 import torch
 
@@ -197,7 +198,8 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         # This argument is optional, defaults to indices.size(0)
         # There's not much point setting this unless it is != indices.size(0)
         bound_m: Optional[torch.Tensor] = None
 
+        start = time.perf_counter()
         self.a2a.dispatch(
             out_expert_num_tokens=expert_num_tokens,
             out_expert_x=expert_x,
@@ -207,6 +209,8 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
             indices=topk_ids.view(dtype=torch.uint32),
             bound_m=bound_m,
         )
+        end = time.perf_counter()
+        logger.debug("dispatch took %.3f ms", (end - start) * 1000)
 
         if expert_x_scale is not None:
             expert_x_scale = expert_x_scale[:, :, :orig_a_scale_block_shape]
@@ -248,8 +252,11 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         if apply_router_weight_on_input:
             topk_weights = torch.ones_like(topk_weights)
 
+        start = time.perf_counter()
         self.a2a.combine(out_tokens=output,
                          indices=topk_ids.view(dtype=torch.uint32),
                          weights=topk_weights,
                          expert_y=fused_expert_output,
                          bound_m=bound_m)
+        end = time.perf_counter()
+        logger.debug("combine took %.3f ms", (end - start) * 1000)
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index e2fdf6f8a11c7..0e6e35690d644 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -946,6 +946,8 @@ class DPEngineCoreProc(EngineCoreProc):
 
                 # We are in a running state and so must execute a dummy pass
                 # if the model didn't execute any ready requests.
+                logger.debug("Executing dummy batch for wave %d.",
+                             self.current_wave)
                 self.execute_dummy_batch()
 
             # 3) All-reduce operation to determine global unfinished reqs.