mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-23 19:27:07 +08:00
updated
Signed-off-by: Robert Shaw <robshaw@redhat.com>
This commit is contained in:
parent
6e2c176e1f
commit
270d05d9fd
@ -595,6 +595,13 @@ def main(args: argparse.Namespace):
|
||||
intermediate_size = config.intermediate_size
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
|
||||
# Expert parallelism
|
||||
if E % args.ep_size != 0:
|
||||
raise ValueError(
|
||||
f"Number of experts {E} must be divisible by expert parallel size {args.ep_size}"
|
||||
)
|
||||
E = E // args.ep_size
|
||||
|
||||
hidden_size = config.hidden_size
|
||||
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
|
||||
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
|
||||
@ -724,7 +731,10 @@ if __name__ == "__main__":
|
||||
"--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tp-size", "-tp", "--tensor-parallel-size", type=int, default=2
|
||||
"--tp-size", "-tp", "--tensor-parallel-size", type=int, default=1
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ep-size", "-ep", "--expert-parallel-size", type=int, default=1
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
|
||||
|
||||
@ -11,7 +11,7 @@ if [ ! -d "$WORKSPACE" ]; then
|
||||
fi
|
||||
|
||||
# install dependencies if not installed
|
||||
pip3 install cmake torch ninja
|
||||
uv pip install cmake torch ninja
|
||||
|
||||
# build nvshmem
|
||||
pushd $WORKSPACE
|
||||
@ -59,7 +59,7 @@ git clone https://github.com/ppl-ai/pplx-kernels
|
||||
cd pplx-kernels
|
||||
# see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
|
||||
# NOTE: counter-intuitively, setting PIP_NO_BUILD_ISOLATION=0 *disables* build isolation (see the pip issue linked above)
|
||||
PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX pip install -vvv -e .
|
||||
PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX uv pip install -vvv -e .
|
||||
popd
|
||||
|
||||
# build and install deepep, require pytorch installed
|
||||
@ -67,5 +67,5 @@ pushd $WORKSPACE
|
||||
git clone https://github.com/deepseek-ai/DeepEP
|
||||
cd DeepEP
|
||||
export NVSHMEM_DIR=$WORKSPACE/nvshmem_install
|
||||
PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e .
|
||||
PIP_NO_BUILD_ISOLATION=0 uv pip install -vvv -e .
|
||||
popd
|
||||
|
||||
@ -197,14 +197,13 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
||||
# This argument is optional, defaults to indices.size(0)
|
||||
# There's not much point setting this unless it is != indices.size(0)
|
||||
bound_m: Optional[torch.Tensor] = None
|
||||
|
||||
self.a2a.dispatch(
|
||||
out_expert_num_tokens=expert_num_tokens,
|
||||
out_expert_x=expert_x,
|
||||
out_expert_x_scale=expert_x_scale,
|
||||
dp_x=a1q,
|
||||
dp_x_scale=a1q_scale,
|
||||
indices=topk_ids,
|
||||
indices=topk_ids.view(dtype=torch.uint32),
|
||||
bound_m=bound_m,
|
||||
)
|
||||
|
||||
@ -249,7 +248,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
||||
topk_weights = torch.ones_like(topk_weights)
|
||||
|
||||
self.a2a.combine(out_tokens=output,
|
||||
indices=topk_ids,
|
||||
indices=topk_ids.view(dtype=torch.uint32),
|
||||
weights=topk_weights,
|
||||
expert_y=fused_expert_output,
|
||||
bound_m=bound_m)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user