mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-03 02:57:54 +08:00
updated
Signed-off-by: Robert Shaw <robshaw@redhat.com>
This commit is contained in:
parent
6ac7b874b1
commit
e830434fe2
@ -2,10 +2,10 @@
|
|||||||
vllm-directory := "/home/rshaw/vllm/"
|
vllm-directory := "/home/rshaw/vllm/"
|
||||||
|
|
||||||
launch_dp_ep MODEL SIZE:
|
launch_dp_ep MODEL SIZE:
|
||||||
vllm serve {{MODEL}} --data-parallel-size {{SIZE}} --enable-expert-parallel --disable-log-requests
|
VLLM_ALL2ALL_BACKEND="pplx" vllm serve {{MODEL}} --data-parallel-size {{SIZE}} --enable-expert-parallel --disable-log-requests --max-model-len 32000 --enforce-eager
|
||||||
|
|
||||||
launch_tp MODEL SIZE:
|
launch_tp MODEL SIZE:
|
||||||
vllm serve {{MODEL}} --tensor-parallel-size {{SIZE}} --disable-log-requests
|
vllm serve {{MODEL}} --tensor-parallel-size {{SIZE}} --disable-log-requests --max-model-len 32000
|
||||||
|
|
||||||
eval MODEL:
|
eval MODEL:
|
||||||
lm_eval --model local-completions --tasks gsm8k \
|
lm_eval --model local-completions --tasks gsm8k \
|
||||||
@ -18,4 +18,19 @@ benchmark MODEL NUM_PROMPTS:
|
|||||||
--random-input-len 1000 \
|
--random-input-len 1000 \
|
||||||
--random-output-len 100 \
|
--random-output-len 100 \
|
||||||
--num-prompts {{NUM_PROMPTS}} \
|
--num-prompts {{NUM_PROMPTS}} \
|
||||||
|
--percentile-metrics ttft,tpot,itl,e2el \
|
||||||
|
--metric-percentiles 90,95,99 \
|
||||||
|
--ignore-eos \
|
||||||
|
--seed $(date +%s)
|
||||||
|
|
||||||
|
benchmark_all_decode MODEL NUM_PROMPTS:
|
||||||
|
python {{vllm-directory}}/benchmarks/benchmark_serving.py \
|
||||||
|
--model {{MODEL}} \
|
||||||
|
--dataset-name random \
|
||||||
|
--random-input-len 1 \
|
||||||
|
--random-output-len 1000 \
|
||||||
|
--num-prompts {{NUM_PROMPTS}} \
|
||||||
|
--percentile-metrics ttft,tpot,itl,e2el \
|
||||||
|
--metric-percentiles 90,95,99 \
|
||||||
|
--ignore-eos \
|
||||||
--seed $(date +%s)
|
--seed $(date +%s)
|
||||||
@ -2,6 +2,7 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
import time
|
||||||
import pplx_kernels as pplx
|
import pplx_kernels as pplx
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@ -197,7 +198,8 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
|||||||
# This argument is optional, defaults to indices.size(0)
|
# This argument is optional, defaults to indices.size(0)
|
||||||
# There's not much point setting this unless it is != indices.size(0)
|
# There's not much point setting this unless it is != indices.size(0)
|
||||||
bound_m: Optional[torch.Tensor] = None
|
bound_m: Optional[torch.Tensor] = None
|
||||||
|
|
||||||
|
start = time.perf_counter()
|
||||||
self.a2a.dispatch(
|
self.a2a.dispatch(
|
||||||
out_expert_num_tokens=expert_num_tokens,
|
out_expert_num_tokens=expert_num_tokens,
|
||||||
out_expert_x=expert_x,
|
out_expert_x=expert_x,
|
||||||
@ -207,6 +209,8 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
|||||||
indices=topk_ids.view(dtype=torch.uint32),
|
indices=topk_ids.view(dtype=torch.uint32),
|
||||||
bound_m=bound_m,
|
bound_m=bound_m,
|
||||||
)
|
)
|
||||||
|
end = time.perf_counter()
|
||||||
|
logger.info("dispatch took %.3f ms", (end - start) * 1000)
|
||||||
|
|
||||||
if expert_x_scale is not None:
|
if expert_x_scale is not None:
|
||||||
expert_x_scale = expert_x_scale[:, :, :orig_a_scale_block_shape]
|
expert_x_scale = expert_x_scale[:, :, :orig_a_scale_block_shape]
|
||||||
@ -248,8 +252,11 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
|||||||
if apply_router_weight_on_input:
|
if apply_router_weight_on_input:
|
||||||
topk_weights = torch.ones_like(topk_weights)
|
topk_weights = torch.ones_like(topk_weights)
|
||||||
|
|
||||||
|
start = time.perf_counter()
|
||||||
self.a2a.combine(out_tokens=output,
|
self.a2a.combine(out_tokens=output,
|
||||||
indices=topk_ids.view(dtype=torch.uint32),
|
indices=topk_ids.view(dtype=torch.uint32),
|
||||||
weights=topk_weights,
|
weights=topk_weights,
|
||||||
expert_y=fused_expert_output,
|
expert_y=fused_expert_output,
|
||||||
bound_m=bound_m)
|
bound_m=bound_m)
|
||||||
|
end = time.perf_counter()
|
||||||
|
logger.info("combine took %.3f ms", (end - start) * 1000)
|
||||||
|
|||||||
@ -946,6 +946,7 @@ class DPEngineCoreProc(EngineCoreProc):
|
|||||||
|
|
||||||
# We are in a running state and so must execute a dummy pass
|
# We are in a running state and so must execute a dummy pass
|
||||||
# if the model didn't execute any ready requests.
|
# if the model didn't execute any ready requests.
|
||||||
|
logger.info("Executing dummy batch for wave %d.", self.current_wave)
|
||||||
self.execute_dummy_batch()
|
self.execute_dummy_batch()
|
||||||
|
|
||||||
# 3) All-reduce operation to determine global unfinished reqs.
|
# 3) All-reduce operation to determine global unfinished reqs.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user