Update Flashinfer to 0.2.14.post1 (#23537)

Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Signed-off-by: siyuanf <siyuanf@nvidia.com>
Signed-off-by: Weiliang Liu <weiliangl@nvidia.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Siyuan Fu <siyuanf@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
weiliang 2025-08-26 09:30:44 +08:00 committed by GitHub
parent 906e461ed6
commit ae067888d6
5 changed files with 14 additions and 7 deletions

View File

@@ -373,7 +373,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # Install FlashInfer from source
 ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
 # Keep this in sync with "flashinfer" extra in setup.py
-ARG FLASHINFER_GIT_REF="v0.2.12"
+ARG FLASHINFER_GIT_REF="v0.2.14.post1"
 # Flag to control whether to compile FlashInfer AOT kernels
 # Set to "true" to enable AOT compilation:
 #   docker build --build-arg FLASHINFER_AOT_COMPILE=true ...

View File

@@ -694,7 +694,7 @@ setup(
             "mistral_common[audio]"],  # Required for audio processing
         "video": [],  # Kept for backwards compatibility
         # FlashInfer should be updated together with the Dockerfile
-        "flashinfer": ["flashinfer-python==0.2.12"],
+        "flashinfer": ["flashinfer-python==0.2.14.post1"],
         # Optional deps for AMD FP4 quantization support
         "petit-kernel": ["petit-kernel"],
    },
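
Note that the Dockerfile's FLASHINFER_GIT_REF above and this "flashinfer" extra pin the same release and, as the comments say, must be bumped together. A minimal sketch (not part of this commit) of a check that the installed wheel matches the shared pin; the expected version string is taken from this diff:

```python
# Sketch only: verify the installed flashinfer-python wheel matches the pin
# shared by setup.py and the Dockerfile (0.2.14.post1 as of this commit).
from importlib.metadata import PackageNotFoundError, version

EXPECTED_FLASHINFER = "0.2.14.post1"

try:
    installed = version("flashinfer-python")
except PackageNotFoundError:
    installed = None

if installed != EXPECTED_FLASHINFER:
    raise RuntimeError(
        f"flashinfer-python=={EXPECTED_FLASHINFER} expected, found {installed}")
```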

View File

@@ -465,7 +465,8 @@ if flashinfer_comm is not None:
                 quant_out=quant_out,
                 scale_out=scale_out,
                 # in vllm we only support swizzled layout
-                layout_code=flashinfer_comm.FP4QuantizationSFLayout.SWIZZLED,
+                layout_code=flashinfer_comm.QuantizationSFLayout.
+                SWIZZLED_128x4,
                 scale_factor=scale_factor,
             )
         else:
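
Newer FlashInfer releases renamed the scale-factor layout enum used here, from FP4QuantizationSFLayout.SWIZZLED to QuantizationSFLayout.SWIZZLED_128x4; vLLM now targets the new name directly. For context, a hypothetical compatibility helper (not part of this commit) that would tolerate both names might look like:

```python
# Hypothetical helper (not in this commit): pick the swizzled scale-factor
# layout code across FlashInfer releases.
def swizzled_layout_code(flashinfer_comm):
    if hasattr(flashinfer_comm, "QuantizationSFLayout"):
        # Newer releases (including the one pinned here) expose the new enum.
        return flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4
    # Older releases such as v0.2.12 used the previous name.
    return flashinfer_comm.FP4QuantizationSFLayout.SWIZZLED
```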

View File

@@ -6,6 +6,7 @@ import torch
 from torch.nn.parameter import Parameter
 from vllm import envs
+from vllm.config import get_current_vllm_config
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig,
                                                   FusedMoEMethodBase)
@@ -113,6 +114,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         self.topk_indices_dtype = None
         self.moe = moe
         self.use_marlin = self._should_use_marlin()
+        self.max_capture_size = get_current_vllm_config(
+        ).compilation_config.max_capture_size

         if current_platform.is_device_capability(100) and not has_flashinfer():
             logger.warning_once(
@@ -520,7 +523,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             x_scale = None
         else:
             x_quant, x_scale = mxfp8_quantize(x, False)  # to mxfp8
-            x_scale = x_scale.view(torch.float8_e4m3fn).reshape(-1)
+            x_scale = x_scale.view(torch.float8_e4m3fn).reshape(
+                *x.shape[:-1], -1)
         trtllm_gen_output = trtllm_fp4_block_scale_moe(
             router_logits.to(torch.bfloat16),
             None,  # routing_bias
@@ -549,6 +553,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             self._get_tile_tokens_dim(x, top_k),
             1 if renormalize else 0,  # routing_method_type, renormalize
             True,  # do finalize
+            tune_max_num_tokens=self.max_capture_size,
         )[0]
         return trtllm_gen_output
     else:
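
Two behavioural changes ride along with the version bump in this file: the mxfp8 activation scales now keep the leading dimensions of x instead of being flattened to 1-D, and the FlashInfer MoE kernel is passed tune_max_num_tokens=self.max_capture_size, presumably so autotuning covers the token counts that CUDA graph capture will later replay. A small, self-contained sketch of the reshape difference, using made-up shapes:

```python
# Sketch with hypothetical shapes: reshape(-1) collapses the scale tensor to
# 1-D, while reshape(*x.shape[:-1], -1) keeps the token dimensions of x so
# batched activations line up with their scales.
import torch

x = torch.randn(2, 4, 64)       # hypothetical activation [batch, seq, hidden]
scales = torch.randn(2, 4, 2)   # hypothetical per-block scales

old_style = scales.reshape(-1)                  # shape: (16,)
new_style = scales.reshape(*x.shape[:-1], -1)   # shape: (2, 4, 2)
print(old_style.shape, new_style.shape)
```

Caching compilation_config.max_capture_size in __init__ and forwarding it to the kernel keeps the tuning bound tied to the largest captured batch size rather than a hard-coded constant.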

View File

@@ -310,6 +310,10 @@ class Worker(WorkerBase):
             logger.info("Compile and warming up model for size %d", size)
             self.model_runner._dummy_run(size, skip_eplb=True)

+        # Warmup and tune the kernels used during model execution before
+        # cuda graph capture.
+        kernel_warmup(self)
+
         if not self.model_config.enforce_eager:
             self.model_runner.capture_model()
@@ -334,9 +338,6 @@ class Worker(WorkerBase):
             self.model_runner._dummy_sampler_run(
                 hidden_states=last_hidden_states)

-        # Warmup kernels used during model execution
-        kernel_warmup(self)
-
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)
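
The kernel_warmup(self) call moves from after the sampler warm-up to before CUDA graph capture, so kernel autotuning (including the FlashInfer tuning enabled above) runs outside graph capture and the captured graphs replay already-tuned kernels. A condensed sketch of the resulting ordering; names not visible in this diff (the wrapper function and warmup_sizes) are stand-ins:

```python
# Condensed sketch of the warm-up ordering after this change; kernel_warmup is
# the helper already used in this file, warmup_sizes stands in for the sizes
# iterated over above.
def warm_up_and_capture(self, warmup_sizes):
    for size in warmup_sizes:
        self.model_runner._dummy_run(size, skip_eplb=True)

    # Tune/warm kernels first, outside of any CUDA graph capture.
    kernel_warmup(self)

    # Graph capture now records the tuned kernel selections.
    if not self.model_config.enforce_eager:
        self.model_runner.capture_model()
```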