From 3ca8322b74332ea4914de59b80475aaa5569c46b Mon Sep 17 00:00:00 2001
From: Bill Nell
Date: Wed, 28 May 2025 23:37:04 +0000
Subject: [PATCH] lint

Signed-off-by: Bill Nell
---
 tests/kernels/moe/test_batched_moe.py          |  6 ------
 tests/kernels/moe/test_pplx_moe.py             |  1 +
 .../layers/fused_moe/fused_batched_moe.py      | 24 ++++++------------------
 .../model_executor/layers/quantization/fp8.py  |  7 +++----
 4 files changed, 10 insertions(+), 28 deletions(-)

diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py
index 31991d4e680f5..d5fb35324de5f 100644
--- a/tests/kernels/moe/test_batched_moe.py
+++ b/tests/kernels/moe/test_batched_moe.py
@@ -205,13 +205,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
     use_fp8_w8a8 = dtype == torch.torch.float8_e4m3fn
     block_shape = [16, 16, 32]  # 16 for k if not fp8
 
-    #print(f"tensors.A {tensors.A.shape}")
-    #print(f"tensors.B {tensors.B.shape}")
-
     if use_fp8_w8a8:
-        #A_scale = torch.ones((1, K), dtype=torch.float32, device=tensors.A.device)
-        #B_scale = torch.ones((N, K), dtype=torch.float32, device=tensors.A.device)
-        #quant_block_shape = [N, K]
         A_scale = torch.ones(1, dtype=torch.float32, device=tensors.A.device)
         B_scale = torch.ones(1, dtype=torch.float32, device=tensors.B.device)
         quant_block_shape = [1, 1]
diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py
index c10c5ba8127a9..3cae2b0ecfdec 100644
--- a/tests/kernels/moe/test_pplx_moe.py
+++ b/tests/kernels/moe/test_pplx_moe.py
@@ -63,6 +63,7 @@ requires_pplx = pytest.mark.skipif(
     reason="Requires PPLX kernels",
 )
 
+
 @dataclasses.dataclass
 class ProcessGroupInfo:
     world_size: int
diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
index c27333f4e704e..bd0489f953ea0 100644
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -10,8 +10,7 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     get_config_dtype_str, try_get_optimal_moe_config)
 from vllm.model_executor.layers.fused_moe.utils import (
-    _resize_cache,
-    moe_kernel_quantize_input)
+    _resize_cache, moe_kernel_quantize_input)
 
 
 @triton.jit
@@ -480,8 +479,7 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                         self.qtype,
                         self.per_act_token,
                         self.block_shape,
-                    )
-                )
+                    ))
             else:
                 b_a1[idx, :rows, :] = rhs
 
@@ -652,10 +650,8 @@ def batched_moe_kernel_quantize_input(
         if num_tokens > 0:
             A_q[e, :num_tokens, :], tmp_scale = moe_kernel_quantize_input(
                 A[e, :num_tokens],
-                A_scale[e, :num_tokens] if A_scale else None,
-                qtype,
-                per_channel_quant,
-                [block_k, block_n])
+                A_scale[e, :num_tokens] if A_scale else None, qtype,
+                per_channel_quant, [block_k, block_n])
             A_q_scale[e, :tmp_scale.shape[0]] = tmp_scale
 
     return A_q, A_q_scale
@@ -812,16 +808,8 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
                         intermediate_cache1.view(-1, N))
 
         qintermediate_cache2, a2q_scale = batched_moe_kernel_quantize_input(
-            intermediate_cache2,
-            a2_scale,
-            num_tokens,
-            E,
-            N,
-            expert_num_tokens,
-            self.qtype,
-            self.per_act_token,
-            self.block_shape
-        )
+            intermediate_cache2, a2_scale, num_tokens, E, N, expert_num_tokens,
+            self.qtype, self.per_act_token, self.block_shape)
 
         invoke_moe_batched_triton_kernel(A=qintermediate_cache2,
                                          B=w2,
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 2fc7c7d7d94f4..35865d5406dbb 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -769,13 +769,12 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             del layer.w2_input_scale
 
     def select_gemm_impl(self, prepare_finalize):
-        from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
-            TritonOrDeepGemmExperts)
         from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
-            BatchedPrepareAndFinalize,
-            BatchedTritonExperts)
+            BatchedPrepareAndFinalize, BatchedTritonExperts)
         from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
             PplxPrepareAndFinalize)
+        from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
+            TritonOrDeepGemmExperts)
 
         assert not self.use_marlin and not self.rocm_aiter_moe_enabled, (
             "Marlin and ROCm AITER are not supported with all2all yet.")