diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index df96e5d8c413e..bdeb169a4b97f 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -10,8 +10,6 @@ from vllm.config import get_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig, FusedMoEMethodBase) -from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( - triton_kernel_moe_forward) from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) from vllm.model_executor.layers.quantization import QuantizationMethods @@ -557,6 +555,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): )[0] return trtllm_gen_output else: + from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( # noqa: E501 + triton_kernel_moe_forward) return triton_kernel_moe_forward( hidden_states=x, w1=self.w13_weight_triton_tensor,