flash_infer: fix missing out dtype bug

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Author: vllmellm
Date: 2025-11-03 08:02:45 +00:00
Parent: 7794009661
Commit: a8010c7b1c


@@ -17,7 +17,7 @@ def apply_weights_fp8(
     x_s: torch.Tensor,
     bias: torch.Tensor,
     x_s_ub: torch.Tensor | None,
-    maybe_out_dtype: torch.dtype | None,
+    maybe_out_dtype: torch.dtype | None = None,
 ) -> torch.Tensor:
     # ops.scaled_fp8_quant supports both dynamic and static quant.
     # If dynamic, layer.input_scale is None and x_s computed from x.
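
Why the default matters: without `= None`, the `maybe_out_dtype` keyword is required, so any caller that has no explicit output dtype to pass fails with a `TypeError` at call time. The sketch below is a hypothetical stand-in for the before/after signatures, not the real vLLM call site; the fallback from `None` to the input's dtype is an assumption for illustration only.

import torch


def before_fix(x: torch.Tensor, maybe_out_dtype: torch.dtype | None) -> torch.Tensor:
    # Old signature: maybe_out_dtype has no default, so it is required.
    out_dtype = maybe_out_dtype if maybe_out_dtype is not None else x.dtype
    return x.to(out_dtype)


def after_fix(x: torch.Tensor, maybe_out_dtype: torch.dtype | None = None) -> torch.Tensor:
    # Fixed signature: callers may omit the argument; None falls back
    # to the input's dtype (assumed fallback, for illustration only).
    out_dtype = maybe_out_dtype if maybe_out_dtype is not None else x.dtype
    return x.to(out_dtype)


x = torch.randn(4, dtype=torch.float16)
try:
    before_fix(x)  # TypeError: missing required positional argument
except TypeError as exc:
    print(exc)
print(after_fix(x).dtype)                 # torch.float16 (falls back to input dtype)
print(after_fix(x, torch.float32).dtype)  # torch.float32 (explicit override still works)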