mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-25 04:07:56 +08:00
remove tile_tokens_dim for trtllm moe
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
This commit is contained in:
parent
769f0918ea
commit
899a3b04d1
@ -30,7 +30,6 @@ if TRTLLM_GEN_MXFP4_AVAILABLE:
|
||||
from flashinfer import (
|
||||
fp4_quantize,
|
||||
mxfp8_quantize,
|
||||
next_positive_power_of_2,
|
||||
reorder_rows_for_gated_act_gemm,
|
||||
shuffle_matrix_a,
|
||||
shuffle_matrix_sf_a,
|
||||
@ -188,30 +187,6 @@ def reference_moe(
|
||||
return t.to(torch.bfloat16)
|
||||
|
||||
|
||||
def get_tile_tokens_dim(x: torch.Tensor, top_k: int, num_experts: int):
|
||||
# Number of tokens in the input tensor.
|
||||
num_tokens = x.shape[0]
|
||||
# Factor to account for the imbalance of the experts.
|
||||
# factor equals to the
|
||||
# max_real_num_tokens_per_expert / perfect_num_tokens_per_expert
|
||||
# - 1.0 means perfect expert distribution.
|
||||
# - > 1.0 means some experts have more
|
||||
# tokens than the perfect distribution.
|
||||
# - < 1.0 does not make sense.
|
||||
imbalance_factor = 1.3
|
||||
# Calculate the number of tokens per expert
|
||||
# assuming perfect distribution.
|
||||
num_tokens_per_expert = (num_tokens * top_k) // num_experts
|
||||
# Apply the imbalance factor.
|
||||
num_tokens_per_expert = int(num_tokens_per_expert * imbalance_factor)
|
||||
# And pad the number to the next power of 2.
|
||||
tile_tokens_dim = next_positive_power_of_2(num_tokens_per_expert)
|
||||
# Cap to 8-64 tokens per CTA tile
|
||||
# as it's the range supported by the kernel.
|
||||
tile_tokens_dim = min(max(tile_tokens_dim, 8), 64)
|
||||
return tile_tokens_dim
|
||||
|
||||
|
||||
def tg_mxfp4_moe(
|
||||
router_logits,
|
||||
topk,
|
||||
@ -460,7 +435,6 @@ def tg_mxfp4_moe(
|
||||
local_expert_offset=0,
|
||||
local_num_experts=num_experts,
|
||||
routed_scaling_factor=None,
|
||||
tile_tokens_dim=get_tile_tokens_dim(hidden_states, topk, num_experts),
|
||||
routing_method_type=1, # renormalize
|
||||
do_finalize=True,
|
||||
)[0]
|
||||
|
||||
@ -5,9 +5,6 @@ import torch
|
||||
|
||||
from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
|
||||
from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
|
||||
from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
|
||||
calculate_tile_tokens_dim,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
per_token_group_quant_fp8,
|
||||
)
|
||||
@ -63,7 +60,6 @@ def flashinfer_fused_moe_blockscale_fp8(
|
||||
local_expert_offset=expert_offset,
|
||||
local_num_experts=local_num_experts,
|
||||
routed_scaling_factor=routed_scaling,
|
||||
tile_tokens_dim=None,
|
||||
routing_method_type=routing_method_type,
|
||||
use_shuffled_weight=False,
|
||||
)
|
||||
@ -151,9 +147,6 @@ def flashinfer_fused_moe_per_tensor_scale_fp8(
|
||||
local_num_experts=local_num_experts,
|
||||
routed_scaling_factor=routed_scaling_factor,
|
||||
use_routing_scales_on_input=use_routing_scales_on_input,
|
||||
tile_tokens_dim=calculate_tile_tokens_dim(
|
||||
hidden_states.shape[0], top_k, num_experts
|
||||
),
|
||||
routing_method_type=routing_method_type,
|
||||
)
|
||||
|
||||
|
||||
@ -123,7 +123,6 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute):
|
||||
"local_expert_offset": local_expert_offset,
|
||||
"local_num_experts": local_num_experts,
|
||||
"routed_scaling_factor": None,
|
||||
"tile_tokens_dim": None,
|
||||
"routing_method_type": 1,
|
||||
"do_finalize": True,
|
||||
"output": output,
|
||||
|
||||
@ -977,8 +977,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
|
||||
self.intermediate_size, # padded to multiple of 256
|
||||
layer.ep_rank * layer.local_num_experts, # local_expert_offset
|
||||
self.num_experts, # local num experts
|
||||
None,
|
||||
None,
|
||||
None, # routed_scaling_factor
|
||||
1 if layer.renormalize else 0, # routing_method_type, renormalize
|
||||
True, # do finalize
|
||||
tune_max_num_tokens=max(self.max_capture_size, 1),
|
||||
|
||||
@ -325,7 +325,6 @@ def flashinfer_trtllm_fp4_moe(
|
||||
local_expert_offset=layer.ep_rank * layer.local_num_experts,
|
||||
local_num_experts=layer.local_num_experts,
|
||||
routed_scaling_factor=None,
|
||||
tile_tokens_dim=None,
|
||||
routing_method_type=routing_method_type,
|
||||
do_finalize=True,
|
||||
)[0]
|
||||
@ -404,7 +403,6 @@ def flashinfer_trtllm_fp4_routed_moe(
|
||||
local_expert_offset=layer.ep_rank * layer.local_num_experts,
|
||||
local_num_experts=layer.local_num_experts,
|
||||
routed_scaling_factor=None,
|
||||
tile_tokens_dim=None,
|
||||
routing_method_type=1,
|
||||
do_finalize=True,
|
||||
)[0]
|
||||
|
||||
@ -28,30 +28,6 @@ class FlashinferMoeBackend(Enum):
|
||||
CUTEDSL = "CUTEDSL"
|
||||
|
||||
|
||||
def calculate_tile_tokens_dim(num_tokens, top_k, num_experts):
|
||||
from flashinfer import next_positive_power_of_2
|
||||
|
||||
# FlashInfer 0.2.10 has issues with larger tile sizes. Set to 8 for now.
|
||||
# TODO: Revert this to dynamic calculation once a new version of FlashInfer
|
||||
# with the necessary kernels is released.
|
||||
tile_tokens_dim = 8
|
||||
|
||||
# A factor considering tokens are not perfectly balanced among experts.
|
||||
imbalance_factor = 1.3
|
||||
# Calculate the number of tokens per expert
|
||||
# assuming perfect distribution.
|
||||
num_tokens_per_expert = (num_tokens * top_k) // num_experts
|
||||
# Apply the imbalance factor.
|
||||
num_tokens_per_expert = int(num_tokens_per_expert * imbalance_factor)
|
||||
# And pad the number to the next power of 2.
|
||||
tile_tokens_dim = next_positive_power_of_2(num_tokens_per_expert)
|
||||
# Cap to 8-max_tile_tokens_dim tokens per CTA tile
|
||||
# as it's the range supported by the kernel.
|
||||
tile_tokens_dim = min(max(tile_tokens_dim, 8), 64)
|
||||
|
||||
return tile_tokens_dim
|
||||
|
||||
|
||||
def swap_w13_to_w31(x: torch.Tensor) -> torch.Tensor:
|
||||
return (
|
||||
x.reshape(-1, 2, x.shape[-2] // 2, x.shape[-1]).flip(dims=[1]).reshape(x.shape)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user