mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-09 05:44:59 +08:00
Update FlashInfer to 0.2.14.post1 (#23537)
Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Signed-off-by: siyuanf <siyuanf@nvidia.com>
Signed-off-by: Weiliang Liu <weiliangl@nvidia.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Siyuan Fu <siyuanf@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
parent
906e461ed6
commit
ae067888d6
@ -373,7 +373,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
|
|||||||
# Install FlashInfer from source
|
# Install FlashInfer from source
|
||||||
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
|
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
|
||||||
# Keep this in sync with "flashinfer" extra in setup.py
|
# Keep this in sync with "flashinfer" extra in setup.py
|
||||||
ARG FLASHINFER_GIT_REF="v0.2.12"
|
ARG FLASHINFER_GIT_REF="v0.2.14.post1"
|
||||||
# Flag to control whether to compile FlashInfer AOT kernels
|
# Flag to control whether to compile FlashInfer AOT kernels
|
||||||
# Set to "true" to enable AOT compilation:
|
# Set to "true" to enable AOT compilation:
|
||||||
# docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
|
# docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
|
||||||
|
|||||||
2
setup.py
2
setup.py
@ -694,7 +694,7 @@ setup(
|
|||||||
"mistral_common[audio]"], # Required for audio processing
|
"mistral_common[audio]"], # Required for audio processing
|
||||||
"video": [], # Kept for backwards compatibility
|
"video": [], # Kept for backwards compatibility
|
||||||
# FlashInfer should be updated together with the Dockerfile
|
# FlashInfer should be updated together with the Dockerfile
|
||||||
"flashinfer": ["flashinfer-python==0.2.12"],
|
"flashinfer": ["flashinfer-python==0.2.14.post1"],
|
||||||
# Optional deps for AMD FP4 quantization support
|
# Optional deps for AMD FP4 quantization support
|
||||||
"petit-kernel": ["petit-kernel"],
|
"petit-kernel": ["petit-kernel"],
|
||||||
},
|
},
|
||||||
|
|||||||
@ -465,7 +465,8 @@ if flashinfer_comm is not None:
|
|||||||
quant_out=quant_out,
|
quant_out=quant_out,
|
||||||
scale_out=scale_out,
|
scale_out=scale_out,
|
||||||
# in vllm we only support swizzled layout
|
# in vllm we only support swizzled layout
|
||||||
layout_code=flashinfer_comm.FP4QuantizationSFLayout.SWIZZLED,
|
layout_code=flashinfer_comm.QuantizationSFLayout.
|
||||||
|
SWIZZLED_128x4,
|
||||||
scale_factor=scale_factor,
|
scale_factor=scale_factor,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
|||||||
@ -6,6 +6,7 @@ import torch
|
|||||||
from torch.nn.parameter import Parameter
|
from torch.nn.parameter import Parameter
|
||||||
|
|
||||||
from vllm import envs
|
from vllm import envs
|
||||||
|
from vllm.config import get_current_vllm_config
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig,
|
from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig,
|
||||||
FusedMoEMethodBase)
|
FusedMoEMethodBase)
|
||||||
@ -113,6 +114,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
|
|||||||
self.topk_indices_dtype = None
|
self.topk_indices_dtype = None
|
||||||
self.moe = moe
|
self.moe = moe
|
||||||
self.use_marlin = self._should_use_marlin()
|
self.use_marlin = self._should_use_marlin()
|
||||||
|
self.max_capture_size = get_current_vllm_config(
|
||||||
|
).compilation_config.max_capture_size
|
||||||
|
|
||||||
if current_platform.is_device_capability(100) and not has_flashinfer():
|
if current_platform.is_device_capability(100) and not has_flashinfer():
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
@ -520,7 +523,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
|
|||||||
x_scale = None
|
x_scale = None
|
||||||
else:
|
else:
|
||||||
x_quant, x_scale = mxfp8_quantize(x, False) # to mxfp8
|
x_quant, x_scale = mxfp8_quantize(x, False) # to mxfp8
|
||||||
x_scale = x_scale.view(torch.float8_e4m3fn).reshape(-1)
|
x_scale = x_scale.view(torch.float8_e4m3fn).reshape(
|
||||||
|
*x.shape[:-1], -1)
|
||||||
trtllm_gen_output = trtllm_fp4_block_scale_moe(
|
trtllm_gen_output = trtllm_fp4_block_scale_moe(
|
||||||
router_logits.to(torch.bfloat16),
|
router_logits.to(torch.bfloat16),
|
||||||
None, # routing_bias
|
None, # routing_bias
|
||||||
@ -549,6 +553,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
|
|||||||
self._get_tile_tokens_dim(x, top_k),
|
self._get_tile_tokens_dim(x, top_k),
|
||||||
1 if renormalize else 0, # routing_method_type, renormalize
|
1 if renormalize else 0, # routing_method_type, renormalize
|
||||||
True, # do finalize
|
True, # do finalize
|
||||||
|
tune_max_num_tokens=self.max_capture_size,
|
||||||
)[0]
|
)[0]
|
||||||
return trtllm_gen_output
|
return trtllm_gen_output
|
||||||
else:
|
else:
|
||||||
|
|||||||
@ -310,6 +310,10 @@ class Worker(WorkerBase):
|
|||||||
logger.info("Compile and warming up model for size %d", size)
|
logger.info("Compile and warming up model for size %d", size)
|
||||||
self.model_runner._dummy_run(size, skip_eplb=True)
|
self.model_runner._dummy_run(size, skip_eplb=True)
|
||||||
|
|
||||||
|
# Warmup and tune the kernels used during model execution before
|
||||||
|
# cuda graph capture.
|
||||||
|
kernel_warmup(self)
|
||||||
|
|
||||||
if not self.model_config.enforce_eager:
|
if not self.model_config.enforce_eager:
|
||||||
self.model_runner.capture_model()
|
self.model_runner.capture_model()
|
||||||
|
|
||||||
@ -334,9 +338,6 @@ class Worker(WorkerBase):
|
|||||||
self.model_runner._dummy_sampler_run(
|
self.model_runner._dummy_sampler_run(
|
||||||
hidden_states=last_hidden_states)
|
hidden_states=last_hidden_states)
|
||||||
|
|
||||||
# Warmup kernels used during model execution
|
|
||||||
kernel_warmup(self)
|
|
||||||
|
|
||||||
# Reset the seed to ensure that the random state is not affected by
|
# Reset the seed to ensure that the random state is not affected by
|
||||||
# the model initialization and profiling.
|
# the model initialization and profiling.
|
||||||
set_random_seed(self.model_config.seed)
|
set_random_seed(self.model_config.seed)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user