From 6b2ef5cd17c5889557711af6f873282a363df042 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Thu, 6 Mar 2025 17:18:06 -0500
Subject: [PATCH] [Bug] Fix Attention when ignored in by quant_method (#14313)

Signed-off-by: mgoin
---
 vllm/attention/layer.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 7810089a05c7e..3cbd38dbd46a6 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -11,6 +11,7 @@ from vllm.attention import AttentionType
 from vllm.attention.selector import backend_name_to_enum, get_attn_backend
 from vllm.config import CacheConfig, get_current_vllm_config
 from vllm.forward_context import ForwardContext, get_forward_context
+from vllm.model_executor.layers.linear import UnquantizedLinearMethod
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
@@ -97,7 +98,8 @@ class Attention(nn.Module):

         quant_method = quant_config.get_quant_method(
             self, prefix=prefix) if quant_config else None
-        if quant_method is not None:
+        if quant_method is not None and not isinstance(
+                quant_method, UnquantizedLinearMethod):
             assert isinstance(quant_method, BaseKVCacheMethod)
             # TODO (mgoin): kv cache dtype should be specified in the FP8
             # checkpoint config and become the "auto" behavior
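
For context, the sketch below illustrates why the extra isinstance guard matters: when a quantization config lists an Attention layer as ignored, get_quant_method may return an UnquantizedLinearMethod, which previously reached the BaseKVCacheMethod assertion and failed. This is a minimal, self-contained illustration, not vLLM's actual code; the stub classes and the helper function name are hypothetical stand-ins.

    # Minimal sketch of the guard this patch adds. The stub classes below are
    # illustrative stand-ins, not vLLM's real implementations.

    class UnquantizedLinearMethod:
        """What a quant config may return for a layer it is told to ignore."""


    class BaseKVCacheMethod:
        """Base class for quant methods that actually handle the KV cache."""


    def maybe_apply_kv_cache_quant(quant_method) -> bool:
        # Before the patch, checking only `quant_method is not None` let an
        # ignored Attention layer (UnquantizedLinearMethod) reach the assert
        # below and raise; the added isinstance check skips it instead.
        if quant_method is not None and not isinstance(
                quant_method, UnquantizedLinearMethod):
            assert isinstance(quant_method, BaseKVCacheMethod)
            # ... KV-cache quantization setup would happen here ...
            return True
        return False


    # An ignored Attention layer now skips KV-cache quantization cleanly.
    assert maybe_apply_kv_cache_quant(UnquantizedLinearMethod()) is False
    assert maybe_apply_kv_cache_quant(None) is False
    assert maybe_apply_kv_cache_quant(BaseKVCacheMethod()) is True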