mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-24 02:17:53 +08:00
[Bugfix] Fix import of CutlassExpertsFp8 in compressed_tensors_moe.py (#20381)
Signed-off-by: Bill Nell <bnell@redhat.com>
This commit is contained in:
parent
9965c47d0d
commit
2e25bb12a8
@ -14,9 +14,9 @@ import vllm.envs as envs
|
|||||||
from vllm import _custom_ops as ops
|
from vllm import _custom_ops as ops
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor.layers.fused_moe import (
|
from vllm.model_executor.layers.fused_moe import (
|
||||||
CutlassExpertsFp8, FusedMoE, FusedMoEActivationFormat, FusedMoEConfig,
|
FusedMoE, FusedMoEActivationFormat, FusedMoEConfig, FusedMoEMethodBase,
|
||||||
FusedMoEMethodBase, FusedMoEPermuteExpertsUnpermute,
|
FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize,
|
||||||
FusedMoEPrepareAndFinalize, FusedMoeWeightScaleSupported, fused_experts)
|
FusedMoeWeightScaleSupported)
|
||||||
from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import ( # noqa
|
from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import ( # noqa
|
||||||
WNA16_SUPPORTED_BITS, WNA16_SUPPORTED_TYPES_MAP)
|
WNA16_SUPPORTED_BITS, WNA16_SUPPORTED_TYPES_MAP)
|
||||||
from vllm.model_executor.layers.quantization.utils import replace_parameter
|
from vllm.model_executor.layers.quantization.utils import replace_parameter
|
||||||
@ -570,6 +570,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
|
|||||||
del layer.w2_input_scale
|
del layer.w2_input_scale
|
||||||
self.fused_experts_func = None
|
self.fused_experts_func = None
|
||||||
else:
|
else:
|
||||||
|
from vllm.model_executor.layers.fused_moe import fused_experts
|
||||||
self.fused_experts_func = fused_experts
|
self.fused_experts_func = fused_experts
|
||||||
|
|
||||||
def apply(
|
def apply(
|
||||||
@ -826,6 +827,7 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod):
|
|||||||
prepare_finalize: FusedMoEPrepareAndFinalize,
|
prepare_finalize: FusedMoEPrepareAndFinalize,
|
||||||
moe: FusedMoEConfig,
|
moe: FusedMoEConfig,
|
||||||
) -> FusedMoEPermuteExpertsUnpermute:
|
) -> FusedMoEPermuteExpertsUnpermute:
|
||||||
|
from vllm.model_executor.layers.fused_moe import CutlassExpertsFp8
|
||||||
|
|
||||||
use_batched_format = (prepare_finalize.activation_format ==
|
use_batched_format = (prepare_finalize.activation_format ==
|
||||||
FusedMoEActivationFormat.BatchedExperts)
|
FusedMoEActivationFormat.BatchedExperts)
|
||||||
|
|||||||
@ -14,10 +14,9 @@ from vllm import _custom_ops as ops
|
|||||||
from vllm.distributed import get_tensor_model_parallel_world_size
|
from vllm.distributed import get_tensor_model_parallel_world_size
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor.layers.fused_moe import (
|
from vllm.model_executor.layers.fused_moe import (
|
||||||
BatchedTritonOrDeepGemmExperts, FusedMoE, FusedMoEActivationFormat,
|
FusedMoE, FusedMoEActivationFormat, FusedMoEConfig, FusedMoEMethodBase,
|
||||||
FusedMoEConfig, FusedMoEMethodBase, FusedMoEPermuteExpertsUnpermute,
|
FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize,
|
||||||
FusedMoEPrepareAndFinalize, FusedMoeWeightScaleSupported,
|
FusedMoeWeightScaleSupported)
|
||||||
TritonOrDeepGemmExperts)
|
|
||||||
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
|
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
|
||||||
UnquantizedLinearMethod)
|
UnquantizedLinearMethod)
|
||||||
from vllm.model_executor.layers.quantization import QuantizationMethods
|
from vllm.model_executor.layers.quantization import QuantizationMethods
|
||||||
@ -785,6 +784,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
|
|||||||
prepare_finalize: FusedMoEPrepareAndFinalize,
|
prepare_finalize: FusedMoEPrepareAndFinalize,
|
||||||
moe: FusedMoEConfig,
|
moe: FusedMoEConfig,
|
||||||
) -> FusedMoEPermuteExpertsUnpermute:
|
) -> FusedMoEPermuteExpertsUnpermute:
|
||||||
|
from vllm.model_executor.layers.fused_moe import (
|
||||||
|
BatchedTritonOrDeepGemmExperts, TritonOrDeepGemmExperts)
|
||||||
|
|
||||||
assert not self.use_marlin and not self.rocm_aiter_moe_enabled, (
|
assert not self.use_marlin and not self.rocm_aiter_moe_enabled, (
|
||||||
"Marlin and ROCm AITER are not supported with all2all yet.")
|
"Marlin and ROCm AITER are not supported with all2all yet.")
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user