From 9dae7d46bfce1b6b6e83e36dd6297c2c9fc58736 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 1 Jul 2025 22:03:43 -0400 Subject: [PATCH] [Refactor] Remove Unused Env `VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON` (#20334) Signed-off-by: yewentao256 --- vllm/envs.py | 7 ------- .../layers/fused_moe/moe_align_block_size.py | 1 - 2 files changed, 8 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index c73dbb0a8446f..0cc6792d72bbd 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -104,7 +104,6 @@ if TYPE_CHECKING: VLLM_SERVER_DEV_MODE: bool = False VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128 VLLM_MLA_DISABLE: bool = False - VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False VLLM_RAY_PER_WORKER_GPUS: float = 1.0 VLLM_RAY_BUNDLE_INDICES: str = "" VLLM_CUDART_SO_PATH: Optional[str] = None @@ -769,12 +768,6 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_MLA_DISABLE": lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))), - # If set, vLLM will use the Triton implementation of moe_align_block_size, - # i.e. moe_align_block_size_triton in fused_moe.py. - "VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON": - lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0")) - ), - # Number of GPUs per worker in Ray, if it is set to be a fraction, # it allows ray to schedule multiple actors on a single GPU, # so that users can colocate other actors on the same GPUs as vLLM. diff --git a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py index ceb96add0fdee..3aae183dfa200 100644 --- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py +++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py @@ -94,7 +94,6 @@ def moe_align_block_size_stage4( # Triton implementation based on: # https://github.com/sgl-project/sglang/commit/ba5112ff691d791a9e38c6c71f59324a5fcb49d0 -# TODO(wentao): Deprecated this function in the future. def moe_align_block_size_triton( topk_ids: torch.Tensor, num_experts: int,