From a8010c7b1c83aa884a3212925c442d37204fb14e Mon Sep 17 00:00:00 2001 From: vllmellm Date: Mon, 3 Nov 2025 08:02:45 +0000 Subject: [PATCH] Fix missing out_dtype default for FlashInfer Signed-off-by: vllmellm --- .../layers/quantization/kernels/scaled_mm/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/utils.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/utils.py index 62bbacbc782cd..e5ab5ad4d47cf 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/utils.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/utils.py @@ -17,7 +17,7 @@ def apply_weights_fp8( x_s: torch.Tensor, bias: torch.Tensor, x_s_ub: torch.Tensor | None, - maybe_out_dtype: torch.dtype | None, + maybe_out_dtype: torch.dtype | None = None, ) -> torch.Tensor: # ops.scaled_fp8_quant supports both dynamic and static quant. # If dynamic, layer.input_scale is None and x_s computed from x.