# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import torch

# Fused experts and PrepareFinalize imports
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
    BatchedDeepGemmExperts)
from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (  # noqa: E501
    BatchedTritonOrDeepGemmExperts)
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
    BatchedTritonExperts, NaiveBatchedExperts)
from vllm.model_executor.layers.fused_moe.layer import TritonExperts
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
    MoEPrepareAndFinalizeNoEP)
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
    TritonOrDeepGemmExperts)
from vllm.utils import has_deep_ep, has_pplx

if has_deep_ep():
    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (  # noqa: E501
        DeepEPHTPrepareAndFinalize)
    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (  # noqa: E501
        DeepEPLLPrepareAndFinalize)

if has_pplx():
    from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
        PplxPrepareAndFinalize)
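
# The MK_* lists below group the available PrepareAndFinalize,
# fused-experts, and quantization-config types from which modular-kernel
# (MK) combinations are assembled.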
MK_MULTI_GPU_PREPARE_FINALIZE_TYPES = []
if has_pplx():
    MK_MULTI_GPU_PREPARE_FINALIZE_TYPES += [PplxPrepareAndFinalize]
if has_deep_ep():
    MK_MULTI_GPU_PREPARE_FINALIZE_TYPES += [
        DeepEPHTPrepareAndFinalize, DeepEPLLPrepareAndFinalize
    ]

MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES = [MoEPrepareAndFinalizeNoEP]

MK_ALL_PREPARE_FINALIZE_TYPES = (MK_MULTI_GPU_PREPARE_FINALIZE_TYPES +
                                 MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES)

MK_FUSED_EXPERT_TYPES = [
    BatchedDeepGemmExperts,
    BatchedTritonExperts,
    NaiveBatchedExperts,
    BatchedTritonOrDeepGemmExperts,
    CutlassExpertsFp8,
    DeepGemmExperts,
    TritonOrDeepGemmExperts,
    TritonExperts,
]
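
# FP8 (float8_e4m3fn) quantization configurations covering the tested
# combinations of per-tensor / per-channel weight scales and per-tensor /
# per-token activation scales, plus 128x128 block quantization.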
MK_QUANT_CONFIGS = [
    None,
    # per-channel / per-column weights and per-tensor activations
    FusedMoEQuantConfig(quant_dtype=torch.float8_e4m3fn,
                        per_out_ch_quant=True,
                        per_act_token_quant=False,
                        block_shape=None),
    # per-channel / per-column weights and per-token activations
    FusedMoEQuantConfig(quant_dtype=torch.float8_e4m3fn,
                        per_out_ch_quant=True,
                        per_act_token_quant=True,
                        block_shape=None),
    # per-tensor weights and per-tensor activations
    FusedMoEQuantConfig(quant_dtype=torch.float8_e4m3fn,
                        per_out_ch_quant=False,
                        per_act_token_quant=False,
                        block_shape=None),
    # per-tensor weights and per-token activations
    FusedMoEQuantConfig(quant_dtype=torch.float8_e4m3fn,
                        per_out_ch_quant=False,
                        per_act_token_quant=True,
                        block_shape=None),
    # block-quantized weights and 128 block per-token activations
    FusedMoEQuantConfig(quant_dtype=torch.float8_e4m3fn,
                        per_out_ch_quant=False,
                        per_act_token_quant=False,
                        block_shape=[128, 128]),
    # TODO (varun) : Should we test the following combinations ?
    # block-quantized weights and per-token activations
    # block-quantized weights and per-tensor activations
]
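
# Illustrative sketch (not part of the original module): the lists above can
# be combined into a full grid of (prepare/finalize, fused-experts,
# quant-config) combinations, e.g. with itertools.product. Whether a given
# combination is actually runnable depends on the installed kernels and
# hardware; this only prints the enumerated grid.
if __name__ == "__main__":
    import itertools

    for pf_type, experts_type, quant_config in itertools.product(
            MK_ALL_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES,
            MK_QUANT_CONFIGS):
        print(pf_type.__name__, experts_type.__name__, quant_config)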