From 769f0918ea73835452f78cda718cbf39d9eaebc1 Mon Sep 17 00:00:00 2001
From: elvischenv <219235043+elvischenv@users.noreply.github.com>
Date: Thu, 18 Dec 2025 13:57:58 -0800
Subject: [PATCH 1/7] bump flashinfer to v0.6.0rc1

Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
---
 docker/Dockerfile               | 3 ++-
 docker/Dockerfile.nightly_torch | 5 ++---
 requirements/cuda.txt           | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index e61021b6eeb85..b76c51cadce85 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -466,9 +466,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Install FlashInfer pre-compiled kernel cache and binaries
 # This is ~1.1GB and only changes when FlashInfer version bumps
 # https://docs.flashinfer.ai/installation.html
-ARG FLASHINFER_VERSION=0.5.3
+ARG FLASHINFER_VERSION=0.6.0rc1
 RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system flashinfer-cubin==${FLASHINFER_VERSION} \
+        --extra-index-url https://flashinfer.ai/whl \
     && uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \
         --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
     && flashinfer show-config
diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch
index d663c82c3885e..c5a0b8fd365ac 100644
--- a/docker/Dockerfile.nightly_torch
+++ b/docker/Dockerfile.nightly_torch
@@ -213,15 +213,14 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
 
 
 # build flashinfer for torch nightly from source around 10 mins
-# release version: v0.5.2
+# release version: v0.6.0rc1
 # todo(elainewy): cache flashinfer build result for faster build
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/uv \
     echo "git clone flashinfer..." \
-    && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
+    && git clone --depth 1 --branch v0.6.0rc1 --recursive https://github.com/flashinfer-ai/flashinfer.git \
     && cd flashinfer \
-    && git checkout v0.5.2 \
     && git submodule update --init --recursive \
     && echo "finish git clone flashinfer..." \
     && rm -rf build \
diff --git a/requirements/cuda.txt b/requirements/cuda.txt
index 1417fb99120bc..11a34df677fd5 100644
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -10,4 +10,4 @@ torchaudio==2.9.1
 # These must be updated alongside torch
 torchvision==0.24.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 # FlashInfer should be updated together with the Dockerfile
-flashinfer-python==0.5.3
+flashinfer-python==0.6.0rc1

From 899a3b04d18999bc0baa547eb0f6b07e536e9f95 Mon Sep 17 00:00:00 2001
From: elvischenv <219235043+elvischenv@users.noreply.github.com>
Date: Thu, 18 Dec 2025 14:11:42 -0800
Subject: [PATCH 2/7] remove tile_tokens_dim for trtllm moe

Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
---
 tests/kernels/moe/test_ocp_mx_moe.py          | 26 -------------------
 .../layers/fused_moe/flashinfer_trtllm_moe.py |  7 -----
 .../layers/fused_moe/trtllm_moe.py            |  1 -
 .../layers/quantization/mxfp4.py              |  3 +--
 .../quantization/utils/flashinfer_fp4_moe.py  |  2 --
 .../quantization/utils/flashinfer_utils.py    | 24 -----------------
 6 files changed, 1 insertion(+), 62 deletions(-)

diff --git a/tests/kernels/moe/test_ocp_mx_moe.py b/tests/kernels/moe/test_ocp_mx_moe.py
index 8fe471d124f43..c9b2b85f004ac 100644
--- a/tests/kernels/moe/test_ocp_mx_moe.py
+++ b/tests/kernels/moe/test_ocp_mx_moe.py
@@ -30,7 +30,6 @@ if TRTLLM_GEN_MXFP4_AVAILABLE:
     from flashinfer import (
         fp4_quantize,
         mxfp8_quantize,
-        next_positive_power_of_2,
         reorder_rows_for_gated_act_gemm,
         shuffle_matrix_a,
         shuffle_matrix_sf_a,
@@ -188,30 +187,6 @@ def reference_moe(
     return t.to(torch.bfloat16)
 
 
-def get_tile_tokens_dim(x: torch.Tensor, top_k: int, num_experts: int):
-    # Number of tokens in the input tensor.
-    num_tokens = x.shape[0]
-    # Factor to account for the imbalance of the experts.
-    # factor equals to the
-    # max_real_num_tokens_per_expert / perfect_num_tokens_per_expert
-    # - 1.0 means perfect expert distribution.
-    # - > 1.0 means some experts have more
-    #     tokens than the perfect distribution.
-    # - < 1.0 does not make sense.
-    imbalance_factor = 1.3
-    # Calculate the number of tokens per expert
-    # assuming perfect distribution.
-    num_tokens_per_expert = (num_tokens * top_k) // num_experts
-    # Apply the imbalance factor.
-    num_tokens_per_expert = int(num_tokens_per_expert * imbalance_factor)
-    # And pad the number to the next power of 2.
-    tile_tokens_dim = next_positive_power_of_2(num_tokens_per_expert)
-    # Cap to 8-64 tokens per CTA tile
-    # as it's the range supported by the kernel.
-    tile_tokens_dim = min(max(tile_tokens_dim, 8), 64)
-    return tile_tokens_dim
-
-
 def tg_mxfp4_moe(
     router_logits,
     topk,
@@ -460,7 +435,6 @@ def tg_mxfp4_moe(
         local_expert_offset=0,
         local_num_experts=num_experts,
         routed_scaling_factor=None,
-        tile_tokens_dim=get_tile_tokens_dim(hidden_states, topk, num_experts),
         routing_method_type=1,  # renormalize
         do_finalize=True,
     )[0]
diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
index 51e06ac54f497..d14300443c814 100644
--- a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
@@ -5,9 +5,6 @@ import torch
 
 from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
 from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
-from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
-    calculate_tile_tokens_dim,
-)
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     per_token_group_quant_fp8,
 )
@@ -63,7 +60,6 @@ def flashinfer_fused_moe_blockscale_fp8(
         local_expert_offset=expert_offset,
         local_num_experts=local_num_experts,
         routed_scaling_factor=routed_scaling,
-        tile_tokens_dim=None,
         routing_method_type=routing_method_type,
         use_shuffled_weight=False,
     )
@@ -151,9 +147,6 @@ def flashinfer_fused_moe_per_tensor_scale_fp8(
         local_num_experts=local_num_experts,
         routed_scaling_factor=routed_scaling_factor,
         use_routing_scales_on_input=use_routing_scales_on_input,
-        tile_tokens_dim=calculate_tile_tokens_dim(
-            hidden_states.shape[0], top_k, num_experts
-        ),
         routing_method_type=routing_method_type,
     )
 
diff --git a/vllm/model_executor/layers/fused_moe/trtllm_moe.py b/vllm/model_executor/layers/fused_moe/trtllm_moe.py
index 132d35e65aba8..4923d96af8dee 100644
--- a/vllm/model_executor/layers/fused_moe/trtllm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/trtllm_moe.py
@@ -123,7 +123,6 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute):
             "local_expert_offset": local_expert_offset,
             "local_num_experts": local_num_experts,
             "routed_scaling_factor": None,
-            "tile_tokens_dim": None,
             "routing_method_type": 1,
             "do_finalize": True,
             "output": output,
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 832925825c453..c50753270b86e 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -977,8 +977,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                 self.intermediate_size,  # padded to multiple of 256
                 layer.ep_rank * layer.local_num_experts,  # local_expert_offset
                 self.num_experts,  # local num experts
-                None,
-                None,
+                None,  # routed_scaling_factor
                 1 if layer.renormalize else 0,  # routing_method_type, renormalize
                 True,  # do finalize
                 tune_max_num_tokens=max(self.max_capture_size, 1),
diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
index 1d410316d6299..4611b83757a69 100644
--- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
+++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
@@ -325,7 +325,6 @@ def flashinfer_trtllm_fp4_moe(
         local_expert_offset=layer.ep_rank * layer.local_num_experts,
         local_num_experts=layer.local_num_experts,
         routed_scaling_factor=None,
-        tile_tokens_dim=None,
         routing_method_type=routing_method_type,
         do_finalize=True,
     )[0]
@@ -404,7 +403,6 @@ def flashinfer_trtllm_fp4_routed_moe(
         local_expert_offset=layer.ep_rank * layer.local_num_experts,
         local_num_experts=layer.local_num_experts,
         routed_scaling_factor=None,
-        tile_tokens_dim=None,
         routing_method_type=1,
         do_finalize=True,
     )[0]
diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
index 3d6e9cda87667..e87f87c3656a7 100644
--- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
@@ -28,30 +28,6 @@ class FlashinferMoeBackend(Enum):
     CUTEDSL = "CUTEDSL"
 
 
-def calculate_tile_tokens_dim(num_tokens, top_k, num_experts):
-    from flashinfer import next_positive_power_of_2
-
-    # FlashInfer 0.2.10 has issues with larger tile sizes. Set to 8 for now.
-    # TODO: Revert this to dynamic calculation once a new version of FlashInfer
-    # with the necessary kernels is released.
-    tile_tokens_dim = 8
-
-    # A factor considering tokens are not perfectly balanced among experts.
-    imbalance_factor = 1.3
-    # Calculate the number of tokens per expert
-    # assuming perfect distribution.
-    num_tokens_per_expert = (num_tokens * top_k) // num_experts
-    # Apply the imbalance factor.
-    num_tokens_per_expert = int(num_tokens_per_expert * imbalance_factor)
-    # And pad the number to the next power of 2.
-    tile_tokens_dim = next_positive_power_of_2(num_tokens_per_expert)
-    # Cap to 8-max_tile_tokens_dim tokens per CTA tile
-    # as it's the range supported by the kernel.
-    tile_tokens_dim = min(max(tile_tokens_dim, 8), 64)
-
-    return tile_tokens_dim
-
-
 def swap_w13_to_w31(x: torch.Tensor) -> torch.Tensor:
     return (
         x.reshape(-1, 2, x.shape[-2] // 2, x.shape[-1]).flip(dims=[1]).reshape(x.shape)

From 4f1aa77ff919286fb87f057ec7eb29499c5020a3 Mon Sep 17 00:00:00 2001
From: Pavani Majety <pavanimajety@gmail.com>
Date: Mon, 22 Dec 2025 10:55:19 -0800
Subject: [PATCH 3/7] Bump FLASHINFER_VERSION to 0.6.0rc2 in docker/Dockerfile

Signed-off-by: Pavani Majety <pavanimajety@gmail.com>
---
 docker/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index b76c51cadce85..d83e312394c66 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -466,7 +466,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Install FlashInfer pre-compiled kernel cache and binaries
 # This is ~1.1GB and only changes when FlashInfer version bumps
 # https://docs.flashinfer.ai/installation.html
-ARG FLASHINFER_VERSION=0.6.0rc1
+ARG FLASHINFER_VERSION=0.6.0rc2
 RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system flashinfer-cubin==${FLASHINFER_VERSION} \
         --extra-index-url https://flashinfer.ai/whl \

From 3ded8c76bc28647be84350af98c1481a329dee86 Mon Sep 17 00:00:00 2001
From: elvischenv <219235043+elvischenv@users.noreply.github.com>
Date: Tue, 23 Dec 2025 09:31:05 +0800
Subject: [PATCH 4/7] Update nightly_torch release version comment to v0.6.0rc2

Co-authored-by: Pavani Majety <pavanimajety@gmail.com>
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
---
 docker/Dockerfile.nightly_torch | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch
index c5a0b8fd365ac..25970f40c3c45 100644
--- a/docker/Dockerfile.nightly_torch
+++ b/docker/Dockerfile.nightly_torch
@@ -213,7 +213,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
 
 
 # build flashinfer for torch nightly from source around 10 mins
-# release version: v0.6.0rc1
+# release version: v0.6.0rc2
 # todo(elainewy): cache flashinfer build result for faster build
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \

From c14a09882a4f2319e396e2647c8777646755f787 Mon Sep 17 00:00:00 2001
From: elvischenv <219235043+elvischenv@users.noreply.github.com>
Date: Tue, 23 Dec 2025 09:31:18 +0800
Subject: [PATCH 5/7] Update nightly_torch flashinfer clone tag to v0.6.0rc2

Co-authored-by: Pavani Majety <pavanimajety@gmail.com>
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
---
 docker/Dockerfile.nightly_torch | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch
index 25970f40c3c45..a8cc1d68d3c27 100644
--- a/docker/Dockerfile.nightly_torch
+++ b/docker/Dockerfile.nightly_torch
@@ -219,7 +219,7 @@ ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/uv \
     echo "git clone flashinfer..." \
-    && git clone --depth 1 --branch v0.6.0rc1 --recursive https://github.com/flashinfer-ai/flashinfer.git \
+    && git clone --depth 1 --branch v0.6.0rc2 --recursive https://github.com/flashinfer-ai/flashinfer.git \
     && cd flashinfer \
     && git submodule update --init --recursive \
     && echo "finish git clone flashinfer..." \

From 54877a1b50babfcd9b4364c54b401a9a7a5a1f64 Mon Sep 17 00:00:00 2001
From: elvischenv <219235043+elvischenv@users.noreply.github.com>
Date: Tue, 23 Dec 2025 09:31:27 +0800
Subject: [PATCH 6/7] Update flashinfer-python requirement to 0.6.0rc2

Co-authored-by: Pavani Majety <pavanimajety@gmail.com>
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
---
 requirements/cuda.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/cuda.txt b/requirements/cuda.txt
index 11a34df677fd5..09d0b4c6bb398 100644
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -10,4 +10,4 @@ torchaudio==2.9.1
 # These must be updated alongside torch
 torchvision==0.24.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 # FlashInfer should be updated together with the Dockerfile
-flashinfer-python==0.6.0rc1
+flashinfer-python==0.6.0rc2

From 97e86fd55922e3b510ac5014387c3bc2096da065 Mon Sep 17 00:00:00 2001
From: elvischenv <219235043+elvischenv@users.noreply.github.com>
Date: Tue, 23 Dec 2025 17:52:53 -0800
Subject: [PATCH 7/7] remove remaining tile_tokens_dim arg in flashinfer_trtllm_fp4_moe

Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
---
 .../layers/quantization/utils/flashinfer_fp4_moe.py              | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
index dceee42f31e39..4611b83757a69 100644
--- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
+++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py
@@ -325,7 +325,6 @@ def flashinfer_trtllm_fp4_moe(
         local_expert_offset=layer.ep_rank * layer.local_num_experts,
         local_num_experts=layer.local_num_experts,
         routed_scaling_factor=None,
-        tile_tokens_dim=None,
         routing_method_type=routing_method_type,
         do_finalize=True,
     )[0]