diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py
index a250ec89cd5ba..b95ee6450f7ec 100644
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -83,6 +83,9 @@ class PPLXAll2AllManager(All2AllManagerBase):
         assert has_pplx, "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install pplx_kernels."  # noqa
         super().__init__(cpu_group)
 
+        # Intranode doesn't work yet.
+        self.internode = True
+
         if self.internode:
             # inter-node communication needs nvshmem,
             # intra-node communication uses p2p mapping directly
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 9770f02e192d2..feefd9522e730 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -269,9 +269,12 @@ class FusedMoEMethodBase(QuantizeMethodBase):
                 hidden_dim_scale_bytes=(0 if moe.in_dtype.itemsize != 1 else (
                     (moe.hidden_dim + moe.block_size - 1) // moe.block_size *
                     torch.float32.itemsize)),
-                group_name=all2all_manager.cpu_group.group_name,
             )
 
+            if not all2all_manager.internode:
+                all_to_all_args["group_name"] = \
+                    all2all_manager.cpu_group.group_name
+
             handle = all2all_manager.get_handle(all_to_all_args)
 
             logger.debug("PplxPrepareAndFinalize")
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 35865d5406dbb..0208b241b7a4f 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -790,7 +790,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             max_num_tokens=MOE_DP_CHUNK_SIZE,
             world_size=all2all_manager.world_size,
             dp_size=all2all_manager.tp_group.world_size,
-            qtype=torch.float8_e4m3fn,
+            use_fp8_w8a8=True,
             block_shape=self.quant_config.weight_block_size,
             per_act_token=False,  #?
         )