From decf3e69bcd5cd6c10340a6a45c77dde9a586ae3 Mon Sep 17 00:00:00 2001
From: Wei-Yu Lin
Date: Thu, 18 Dec 2025 22:42:02 +0000
Subject: [PATCH] Remove MOE xla implementation

Signed-off-by: Wei-Yu Lin
---
 docs/design/moe_kernel_features.md            |  1 -
 vllm/attention/layers/mm_encoder_attention.py | 25 -------------------
 2 files changed, 26 deletions(-)

diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md
index 6c02dcb76bec2..11c6e488f958f 100644
--- a/docs/design/moe_kernel_features.md
+++ b/docs/design/moe_kernel_features.md
@@ -92,7 +92,6 @@ To be used with a particular `FusedMoEPrepareAndFinalize` subclass, MoE kernels
 | gpt oss triton | standard | N/A | N/A | 5 | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],<br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
 | marlin | standard,<br>batched | 3 / N/A | 3 / N/A | silu,<br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],<br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],<br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
 | trtllm | standard | mxfp4,<br>nvfp4 | G(16),G(32) | 5 | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
-| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
 | iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] |
 | rocm aiter moe | standard | fp8 | G(128),A,T | silu, gelu | Y | N | [`rocm_aiter_fused_experts`][vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe.rocm_aiter_fused_experts] |
 | cpu_fused_moe | standard | N/A | N/A | silu | N | N | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] |
diff --git a/vllm/attention/layers/mm_encoder_attention.py b/vllm/attention/layers/mm_encoder_attention.py
index 1c1623b13f55a..138fc99114127 100644
--- a/vllm/attention/layers/mm_encoder_attention.py
+++ b/vllm/attention/layers/mm_encoder_attention.py
@@ -227,28 +227,3 @@ class MMEncoderAttention(CustomOp):
             "XPU only supports FLASH_ATTN for vision attention."
         )
         return self._forward_fa(query, key, value, cu_seqlens, max_seqlen)
-
-    def forward_tpu(
-        self,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        cu_seqlens: torch.Tensor | None = None,
-        max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
-    ) -> torch.Tensor:
-        assert self.attn_backend == AttentionBackendEnum.PALLAS, (
-            f"MMEncoderAttention on TPU only supports PALLAS backend, "
-            f"but got {self.attn_backend}."
-        )
-        if cu_seqlens is None:
-            query, key, value = (x.transpose(1, 2) for x in (query, key, value))
-            from torch_xla.experimental.custom_kernel import flash_attention
-
-            out = flash_attention(query, key, value, sm_scale=self.scale)
-            out = out.transpose(1, 2)
-            return out
-        logger.warning_once(
-            "PALLAS backend with cu_seqlens is not supported for ViT yet. ",
-            "Falling back to SDPA implementation.",
-        )
-        return self._forward_sdpa(query, key, value, cu_seqlens)