Signed-off-by: Bill Nell <bnell@redhat.com>
This commit is contained in:
Bill Nell 2025-05-29 02:08:22 +00:00
parent d86e3f0172
commit caca0b718a
3 changed files with 8 additions and 2 deletions

View File

@@ -83,6 +83,9 @@ class PPLXAll2AllManager(All2AllManagerBase):
assert has_pplx, "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install pplx_kernels." # noqa
super().__init__(cpu_group)
# Intranode doesn't work yet.
self.internode = True
if self.internode:
# inter-node communication needs nvshmem,
# intra-node communication uses p2p mapping directly

View File

@@ -269,9 +269,12 @@ class FusedMoEMethodBase(QuantizeMethodBase):
hidden_dim_scale_bytes=(0 if moe.in_dtype.itemsize != 1 else (
(moe.hidden_dim + moe.block_size - 1) // moe.block_size *
torch.float32.itemsize)),
group_name=all2all_manager.cpu_group.group_name,
)
if not all2all_manager.internode:
all_to_all_args["group_name"] = \
all2all_manager.cpu_group.group_name
handle = all2all_manager.get_handle(all_to_all_args)
logger.debug("PplxPrepareAndFinalize")

View File

@@ -790,7 +790,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
max_num_tokens=MOE_DP_CHUNK_SIZE,
world_size=all2all_manager.world_size,
dp_size=all2all_manager.tp_group.world_size,
qtype=torch.float8_e4m3fn,
use_fp8_w8a8=True,
block_shape=self.quant_config.weight_block_size,
per_act_token=False, #?
)