From 3ca8322b74332ea4914de59b80475aaa5569c46b Mon Sep 17 00:00:00 2001
From: Bill Nell
Date: Wed, 28 May 2025 23:37:04 +0000
Subject: [PATCH] lint

Signed-off-by: Bill Nell
---
 tests/kernels/moe/test_batched_moe.py          |  6 ------
 tests/kernels/moe/test_pplx_moe.py             |  1 +
 .../layers/fused_moe/fused_batched_moe.py      | 24 ++++++------------------
 .../model_executor/layers/quantization/fp8.py  |  7 +++----
 4 files changed, 10 insertions(+), 28 deletions(-)

diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py
index 31991d4e680f5..d5fb35324de5f 100644
--- a/tests/kernels/moe/test_batched_moe.py
+++ b/tests/kernels/moe/test_batched_moe.py
@@ -205,13 +205,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
     use_fp8_w8a8 = dtype == torch.torch.float8_e4m3fn
     block_shape = [16, 16, 32]  # 16 for k if not fp8
 
-    #print(f"tensors.A {tensors.A.shape}")
-    #print(f"tensors.B {tensors.B.shape}")
-
     if use_fp8_w8a8:
-        #A_scale = torch.ones((1, K), dtype=torch.float32, device=tensors.A.device)
-        #B_scale = torch.ones((N, K), dtype=torch.float32, device=tensors.A.device)
-        #quant_block_shape = [N, K]
         A_scale = torch.ones(1, dtype=torch.float32, device=tensors.A.device)
         B_scale = torch.ones(1, dtype=torch.float32, device=tensors.B.device)
         quant_block_shape = [1, 1]
diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py
index c10c5ba8127a9..3cae2b0ecfdec 100644
--- a/tests/kernels/moe/test_pplx_moe.py
+++ b/tests/kernels/moe/test_pplx_moe.py
@@ -63,6 +63,7 @@ requires_pplx = pytest.mark.skipif(
     reason="Requires PPLX kernels",
 )
 
+
 @dataclasses.dataclass
 class ProcessGroupInfo:
     world_size: int
diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
index c27333f4e704e..bd0489f953ea0 100644
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -10,8 +10,7 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     get_config_dtype_str, try_get_optimal_moe_config)
 from vllm.model_executor.layers.fused_moe.utils import (
-    _resize_cache,
-    moe_kernel_quantize_input)
+    _resize_cache, moe_kernel_quantize_input)
 
 
 @triton.jit
@@ -480,8 +479,7 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                         self.qtype,
                         self.per_act_token,
                         self.block_shape,
-                    )
-                )
+                    ))
             else:
                 b_a1[idx, :rows, :] = rhs
 
@@ -652,10 +650,8 @@ def batched_moe_kernel_quantize_input(
         if num_tokens > 0:
             A_q[e, :num_tokens, :], tmp_scale = moe_kernel_quantize_input(
                 A[e, :num_tokens],
-                A_scale[e, :num_tokens] if A_scale else None,
-                qtype,
-                per_channel_quant,
-                [block_k, block_n])
+                A_scale[e, :num_tokens] if A_scale else None, qtype,
+                per_channel_quant, [block_k, block_n])
             A_q_scale[e, :tmp_scale.shape[0]] = tmp_scale
 
     return A_q, A_q_scale
@@ -812,16 +808,8 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
                         intermediate_cache1.view(-1, N))
 
         qintermediate_cache2, a2q_scale = batched_moe_kernel_quantize_input(
-            intermediate_cache2,
-            a2_scale,
-            num_tokens,
-            E,
-            N,
-            expert_num_tokens,
-            self.qtype,
-            self.per_act_token,
-            self.block_shape
-        )
+            intermediate_cache2, a2_scale, num_tokens, E, N, expert_num_tokens,
+            self.qtype, self.per_act_token, self.block_shape)
 
         invoke_moe_batched_triton_kernel(A=qintermediate_cache2,
                                          B=w2,
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 2fc7c7d7d94f4..35865d5406dbb 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -769,13 +769,12 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             del layer.w2_input_scale
 
     def select_gemm_impl(self, prepare_finalize):
-        from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
-            TritonOrDeepGemmExperts)
         from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
-            BatchedPrepareAndFinalize,
-            BatchedTritonExperts)
+            BatchedPrepareAndFinalize, BatchedTritonExperts)
         from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
             PplxPrepareAndFinalize)
+        from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
+            TritonOrDeepGemmExperts)
 
         assert not self.use_marlin and not self.rocm_aiter_moe_enabled, (
             "Marlin and ROCm AITER are not supported with all2all yet.")