From c7fc6b1354a20f5dbdd2fb806cd4b7da27d46f63 Mon Sep 17 00:00:00 2001
From: Lucia Fang <116399278+luccafong@users.noreply.github.com>
Date: Sun, 24 Aug 2025 15:35:41 -0700
Subject: [PATCH] fix incompatibility with non-CUDA platforms for nvfp4 (#23478)

Signed-off-by: Lu Fang
Co-authored-by: Lucia (Lu) Fang
---
 vllm/compilation/fusion.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py
index 413948799de3..0d8d562514e3 100644
--- a/vllm/compilation/fusion.py
+++ b/vllm/compilation/fusion.py
@@ -47,8 +47,10 @@ QUANT_OPS: dict[QuantKey, OpOverload] = {
     torch.ops._C.dynamic_scaled_fp8_quant.default,  # noqa: E501
     kFp8DynamicTokenSym:
     torch.ops._C.dynamic_per_token_scaled_fp8_quant.default,  # noqa: E501
-    kNvfp4Quant: torch.ops._C.scaled_fp4_quant.default,  # noqa: E501
 }
+if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"):
+    QUANT_OPS[
+        kNvfp4Quant] = torch.ops._C.scaled_fp4_quant.default  # noqa: E501
 
 
 class FusedRMSQuantKey(NamedTuple):
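
Editor's note: below is a minimal, self-contained sketch of the guarded-registration
pattern this patch applies. It is not vLLM code: torch.cuda.is_available() stands in
for vLLM's current_platform.is_cuda(), and the get_quant_op helper and the "nvfp4"
key are hypothetical. The underlying issue is that torch.ops._C.scaled_fp4_quant only
exists when the CUDA custom-op extension providing it was compiled, so referencing it
unconditionally in the QUANT_OPS dict raises AttributeError at import time on
non-CUDA builds; guarding registration keeps the import safe and lets lookups fail
with a clear error instead.

# Sketch of the guarded op registration (hypothetical names; assumes
# torch.cuda.is_available() as a stand-in for current_platform.is_cuda()).
import torch

QUANT_OPS: dict = {}

# hasattr() probes the torch.ops._C namespace without raising when the
# custom op was never registered, e.g. on ROCm/CPU-only builds.
if torch.cuda.is_available() and hasattr(torch.ops._C, "scaled_fp4_quant"):
    QUANT_OPS["nvfp4"] = torch.ops._C.scaled_fp4_quant.default

def get_quant_op(key: str):
    # Hypothetical lookup helper: surface a clear, platform-aware error at
    # use time rather than an AttributeError at module import time.
    try:
        return QUANT_OPS[key]
    except KeyError:
        raise NotImplementedError(
            f"quant op {key!r} is not available on this platform") from None

The double guard in the patch appears deliberate: the platform check short-circuits
cheaply on non-CUDA platforms, while the hasattr check also covers CUDA builds that
were compiled without the FP4 kernels, so neither condition alone would be sufficient.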