mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-12 15:37:06 +08:00
deep_ep + use_fp8_dispatch
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
This commit is contained in:
parent
4c8f64faa7
commit
8de2fd39fc
@ -45,7 +45,8 @@ if current_platform.is_cuda_alike():
|
|||||||
from .pplx_prepare_finalize import PplxPrepareAndFinalize
|
from .pplx_prepare_finalize import PplxPrepareAndFinalize
|
||||||
if has_deepep:
|
if has_deepep:
|
||||||
from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize
|
from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize
|
||||||
from .deepep_ll_prepare_finalize import DeepEPLLPrepareAndFinalize
|
from .deepep_ll_prepare_finalize import (DEEPEP_QUANT_BLOCK_SIZE,
|
||||||
|
DeepEPLLPrepareAndFinalize)
|
||||||
else:
|
else:
|
||||||
fused_experts = None # type: ignore
|
fused_experts = None # type: ignore
|
||||||
FusedMoEPermuteExpertsUnpermute = None # type: ignore
|
FusedMoEPermuteExpertsUnpermute = None # type: ignore
|
||||||
@ -377,6 +378,12 @@ class FusedMoEMethodBase(QuantizeMethodBase):
|
|||||||
all2all_manager.world_size)
|
all2all_manager.world_size)
|
||||||
handle = all2all_manager.get_handle(all_to_all_args)
|
handle = all2all_manager.get_handle(all_to_all_args)
|
||||||
|
|
||||||
|
# Note: We may want to use FP8 dispatch even otherwise just to
|
||||||
|
# reduce data movement.
|
||||||
|
use_fp8_dispatch = (quant_dtype == current_platform.fp8_dtype()
|
||||||
|
and act_quant_block_size
|
||||||
|
== DEEPEP_QUANT_BLOCK_SIZE)
|
||||||
|
|
||||||
# Note (varun): Whether to use FP8 dispatch or not needs some
|
# Note (varun): Whether to use FP8 dispatch or not needs some
|
||||||
# profiling. Turning it off for now.
|
# profiling. Enabling it only for block-quantized FP8 for now.
|
||||||
prepare_finalize = DeepEPLLPrepareAndFinalize(
|
prepare_finalize = DeepEPLLPrepareAndFinalize(
|
||||||
@ -386,7 +393,7 @@ class FusedMoEMethodBase(QuantizeMethodBase):
|
|||||||
max_tokens_per_rank=moe.max_num_tokens,
|
max_tokens_per_rank=moe.max_num_tokens,
|
||||||
quant_dtype=quant_dtype,
|
quant_dtype=quant_dtype,
|
||||||
block_shape=act_quant_block_size,
|
block_shape=act_quant_block_size,
|
||||||
use_fp8_dispatch=False,
|
use_fp8_dispatch=use_fp8_dispatch,
|
||||||
)
|
)
|
||||||
|
|
||||||
self.topk_indices_dtype = None
|
self.topk_indices_dtype = None
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user