mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-16 13:37:23 +08:00
updated
Signed-off-by: Robert Shaw <robshaw@redhat.com>
This commit is contained in:
parent
6ac7b874b1
commit
e830434fe2
@ -2,10 +2,10 @@
|
||||
vllm-directory := "/home/rshaw/vllm/"
|
||||
|
||||
# launch_dp_ep: start `vllm serve` for MODEL with --data-parallel-size SIZE
# and --enable-expert-parallel, with request logging disabled.
# NOTE(review): two `vllm serve` lines appear under this recipe; they look
# like the before/after lines of a diff hunk whose +/- markers were lost.
# The second one additionally sets VLLM_ALL2ALL_BACKEND="pplx" and passes
# --max-model-len 32000 and --enforce-eager. Confirm which line is current.
launch_dp_ep MODEL SIZE:
|
||||
vllm serve {{MODEL}} --data-parallel-size {{SIZE}} --enable-expert-parallel --disable-log-requests
|
||||
VLLM_ALL2ALL_BACKEND="pplx" vllm serve {{MODEL}} --data-parallel-size {{SIZE}} --enable-expert-parallel --disable-log-requests --max-model-len 32000 --enforce-eager
|
||||
|
||||
# launch_tp: start `vllm serve` for MODEL with --tensor-parallel-size SIZE,
# with request logging disabled.
# NOTE(review): two `vllm serve` lines appear under this recipe; they look
# like the before/after lines of a diff hunk whose +/- markers were lost.
# The second one only adds --max-model-len 32000. Confirm which is current.
launch_tp MODEL SIZE:
|
||||
vllm serve {{MODEL}} --tensor-parallel-size {{SIZE}} --disable-log-requests
|
||||
vllm serve {{MODEL}} --tensor-parallel-size {{SIZE}} --disable-log-requests --max-model-len 32000
|
||||
|
||||
eval MODEL:
|
||||
lm_eval --model local-completions --tasks gsm8k \
|
||||
@ -18,4 +18,19 @@ benchmark MODEL NUM_PROMPTS:
|
||||
--random-input-len 1000 \
|
||||
--random-output-len 100 \
|
||||
--num-prompts {{NUM_PROMPTS}} \
|
||||
--percentile-metrics ttft,tpot,itl,e2el \
|
||||
--metric-percentiles 90,95,99 \
|
||||
--ignore-eos \
|
||||
--seed $(date +%s)
|
||||
|
||||
# benchmark_all_decode: run benchmarks/benchmark_serving.py (located under
# vllm-directory) against MODEL using the "random" dataset with
# --random-input-len 1 and --random-output-len 1000, i.e. a workload that is
# almost entirely generation (lengths presumably in tokens; verify against
# the benchmark script). NUM_PROMPTS sets --num-prompts.
# Reports ttft, tpot, itl and e2el at the 90th/95th/99th percentiles and
# passes --ignore-eos.
# The seed is `date +%s` (current Unix time in seconds), so separate
# invocations normally use different seeds and are not reproducible.
# NOTE(review): vllm-directory is defined with a trailing "/", so the script
# path expands with a doubled slash; harmless on POSIX, but worth tidying.
benchmark_all_decode MODEL NUM_PROMPTS:
|
||||
python {{vllm-directory}}/benchmarks/benchmark_serving.py \
|
||||
--model {{MODEL}} \
|
||||
--dataset-name random \
|
||||
--random-input-len 1 \
|
||||
--random-output-len 1000 \
|
||||
--num-prompts {{NUM_PROMPTS}} \
|
||||
--percentile-metrics ttft,tpot,itl,e2el \
|
||||
--metric-percentiles 90,95,99 \
|
||||
--ignore-eos \
|
||||
--seed $(date +%s)
|
||||
@ -2,6 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Optional
|
||||
|
||||
import time
|
||||
import pplx_kernels as pplx
|
||||
import torch
|
||||
|
||||
@ -197,7 +198,8 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
||||
# This argument is optional, defaults to indices.size(0)
|
||||
# There's not much point setting this unless it is != indices.size(0)
|
||||
bound_m: Optional[torch.Tensor] = None
|
||||
|
||||
|
||||
start = time.perf_counter()
|
||||
self.a2a.dispatch(
|
||||
out_expert_num_tokens=expert_num_tokens,
|
||||
out_expert_x=expert_x,
|
||||
@ -207,6 +209,8 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
||||
indices=topk_ids.view(dtype=torch.uint32),
|
||||
bound_m=bound_m,
|
||||
)
|
||||
end = time.perf_counter()
|
||||
logger.info("dispatch took %.3f ms", (end - start) * 1000)
|
||||
|
||||
if expert_x_scale is not None:
|
||||
expert_x_scale = expert_x_scale[:, :, :orig_a_scale_block_shape]
|
||||
@ -248,8 +252,11 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
||||
if apply_router_weight_on_input:
|
||||
topk_weights = torch.ones_like(topk_weights)
|
||||
|
||||
start = time.perf_counter()
|
||||
self.a2a.combine(out_tokens=output,
|
||||
indices=topk_ids.view(dtype=torch.uint32),
|
||||
weights=topk_weights,
|
||||
expert_y=fused_expert_output,
|
||||
bound_m=bound_m)
|
||||
end = time.perf_counter()
|
||||
logger.info("combine took %.3f ms", (end - start) * 1000)
|
||||
|
||||
@ -946,6 +946,7 @@ class DPEngineCoreProc(EngineCoreProc):
|
||||
|
||||
# We are in a running state and so must execute a dummy pass
|
||||
# if the model didn't execute any ready requests.
|
||||
logger.info("Executing dummy batch for wave %d.", self.current_wave)
|
||||
self.execute_dummy_batch()
|
||||
|
||||
# 3) All-reduce operation to determine global unfinished reqs.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user