From 6b2ef5cd17c5889557711af6f873282a363df042 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Thu, 6 Mar 2025 17:18:06 -0500
Subject: [PATCH] [Bug] Fix Attention when ignored in by quant_method (#14313)

Signed-off-by: mgoin
---
 vllm/attention/layer.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 7810089a05c7e..3cbd38dbd46a6 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -11,6 +11,7 @@ from vllm.attention import AttentionType
 from vllm.attention.selector import backend_name_to_enum, get_attn_backend
 from vllm.config import CacheConfig, get_current_vllm_config
 from vllm.forward_context import ForwardContext, get_forward_context
+from vllm.model_executor.layers.linear import UnquantizedLinearMethod
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
@@ -97,7 +98,8 @@ class Attention(nn.Module):

         quant_method = quant_config.get_quant_method(
             self, prefix=prefix) if quant_config else None
-        if quant_method is not None:
+        if quant_method is not None and not isinstance(
+                quant_method, UnquantizedLinearMethod):
             assert isinstance(quant_method, BaseKVCacheMethod)
             # TODO (mgoin): kv cache dtype should be specified in the FP8
             # checkpoint config and become the "auto" behavior
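
For context, the sketch below illustrates why the extra isinstance guard matters: when a quantization config lists an Attention layer as ignored, get_quant_method may return an UnquantizedLinearMethod, which previously reached the BaseKVCacheMethod assertion and failed. This is a minimal, self-contained illustration, not vLLM's actual code; the stub classes and the helper function name are hypothetical stand-ins.

    # Minimal sketch of the guard this patch adds. The stub classes below are
    # illustrative stand-ins, not vLLM's real implementations.

    class UnquantizedLinearMethod:
        """What a quant config may return for a layer it is told to ignore."""


    class BaseKVCacheMethod:
        """Base class for quant methods that actually handle the KV cache."""


    def maybe_apply_kv_cache_quant(quant_method) -> bool:
        # Before the patch, checking only `quant_method is not None` let an
        # ignored Attention layer (UnquantizedLinearMethod) reach the assert
        # below and raise; the added isinstance check skips it instead.
        if quant_method is not None and not isinstance(
                quant_method, UnquantizedLinearMethod):
            assert isinstance(quant_method, BaseKVCacheMethod)
            # ... KV-cache quantization setup would happen here ...
            return True
        return False


    # An ignored Attention layer now skips KV-cache quantization cleanly.
    assert maybe_apply_kv_cache_quant(UnquantizedLinearMethod()) is False
    assert maybe_apply_kv_cache_quant(None) is False
    assert maybe_apply_kv_cache_quant(BaseKVCacheMethod()) is True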