mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-26 03:37:52 +08:00
FIX MOE issue in AutoRound format (#18586)
Signed-off-by: wenhuach21 <wenhua.cheng@intel.com>
This commit is contained in:
parent
45ab403a1f
commit
ec82c3e388
@ -58,7 +58,7 @@ vLLM is fast with:
|
|||||||
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
|
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
|
||||||
- Continuous batching of incoming requests
|
- Continuous batching of incoming requests
|
||||||
- Fast model execution with CUDA/HIP graph
|
- Fast model execution with CUDA/HIP graph
|
||||||
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
|
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8.
|
||||||
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
|
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
|
||||||
- Speculative decoding
|
- Speculative decoding
|
||||||
- Chunked prefill
|
- Chunked prefill
|
||||||
|
|||||||
@ -8,6 +8,7 @@ import torch
|
|||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor.layers.linear import (LinearBase,
|
from vllm.model_executor.layers.linear import (LinearBase,
|
||||||
UnquantizedLinearMethod)
|
UnquantizedLinearMethod)
|
||||||
|
from vllm.model_executor.layers.quantization import QuantizationMethods
|
||||||
from vllm.model_executor.layers.quantization.base_config import (
|
from vllm.model_executor.layers.quantization.base_config import (
|
||||||
QuantizationConfig)
|
QuantizationConfig)
|
||||||
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
||||||
@ -74,7 +75,7 @@ class AutoRoundConfig(QuantizationConfig):
|
|||||||
f"group_size={self.group_size}, sym={self.sym})")
|
f"group_size={self.group_size}, sym={self.sym})")
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_name(cls): ## use str will trigger preci issue
|
def get_name(cls) -> QuantizationMethods:
|
||||||
return "auto-round"
|
return "auto-round"
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@ -142,18 +143,18 @@ class AutoRoundConfig(QuantizationConfig):
|
|||||||
prefix, layer.__class__.__name__, weight_bits, group_size,
|
prefix, layer.__class__.__name__, weight_bits, group_size,
|
||||||
sym)
|
sym)
|
||||||
if backend == "auto" or "marlin" in backend:
|
if backend == "auto" or "marlin" in backend:
|
||||||
if isinstance(layer, FusedMoE):
|
AWQ_TYPE_MAP = {
|
||||||
use_marlin = check_moe_marlin_supports_layer(layer, group_size)
|
4: scalar_types.uint4,
|
||||||
else:
|
8: scalar_types.uint8,
|
||||||
|
}
|
||||||
|
use_marlin = (weight_bits
|
||||||
|
in AWQ_TYPE_MAP) and check_marlin_supported(
|
||||||
|
AWQ_TYPE_MAP[weight_bits], group_size, not sym)
|
||||||
|
|
||||||
|
if isinstance(layer, FusedMoE):
|
||||||
|
use_marlin = use_marlin and check_moe_marlin_supports_layer(
|
||||||
|
layer, group_size)
|
||||||
|
|
||||||
AWQ_TYPE_MAP = {
|
|
||||||
4: scalar_types.uint4,
|
|
||||||
8: scalar_types.uint8,
|
|
||||||
}
|
|
||||||
use_marlin = ((weight_bits, sym) in AWQ_TYPE_MAP
|
|
||||||
and check_marlin_supported(
|
|
||||||
AWQ_TYPE_MAP[(weight_bits)], group_size,
|
|
||||||
not sym))
|
|
||||||
else:
|
else:
|
||||||
use_marlin = False
|
use_marlin = False
|
||||||
if use_marlin:
|
if use_marlin:
|
||||||
@ -180,10 +181,11 @@ class AutoRoundConfig(QuantizationConfig):
|
|||||||
from vllm.model_executor.layers.quantization.moe_wna16 import (
|
from vllm.model_executor.layers.quantization.moe_wna16 import (
|
||||||
MoeWNA16Config)
|
MoeWNA16Config)
|
||||||
config = {
|
config = {
|
||||||
"linear_quant_method": "awq",
|
"quant_method": "awq",
|
||||||
"weight_bits": weight_bits,
|
"bits": weight_bits,
|
||||||
"group_size": group_size,
|
"group_size": group_size,
|
||||||
"zero_point": not sym,
|
"zero_point": not sym,
|
||||||
|
"lm_head": False,
|
||||||
}
|
}
|
||||||
return MoeWNA16Config.from_config(config).get_quant_method(
|
return MoeWNA16Config.from_config(config).get_quant_method(
|
||||||
layer, prefix)
|
layer, prefix)
|
||||||
@ -213,18 +215,18 @@ class AutoRoundConfig(QuantizationConfig):
|
|||||||
prefix, layer.__class__.__name__, weight_bits, group_size,
|
prefix, layer.__class__.__name__, weight_bits, group_size,
|
||||||
sym)
|
sym)
|
||||||
if backend == "auto" or "marlin" in backend:
|
if backend == "auto" or "marlin" in backend:
|
||||||
|
GPTQ_TYPE_MAP = {
|
||||||
|
(4, True): scalar_types.uint4b8,
|
||||||
|
(8, True): scalar_types.uint8b128,
|
||||||
|
}
|
||||||
|
use_marlin = ((weight_bits, sym) in GPTQ_TYPE_MAP
|
||||||
|
and check_marlin_supported(
|
||||||
|
GPTQ_TYPE_MAP[(weight_bits, sym)],
|
||||||
|
group_size,
|
||||||
|
has_zp=not sym))
|
||||||
if isinstance(layer, FusedMoE):
|
if isinstance(layer, FusedMoE):
|
||||||
use_marlin = check_moe_marlin_supports_layer(layer, group_size)
|
use_marlin = use_marlin and check_moe_marlin_supports_layer(
|
||||||
else:
|
layer, group_size)
|
||||||
GPTQ_TYPE_MAP = {
|
|
||||||
(4, True): scalar_types.uint4b8,
|
|
||||||
(8, True): scalar_types.uint8b128,
|
|
||||||
}
|
|
||||||
use_marlin = ((weight_bits, sym) in GPTQ_TYPE_MAP
|
|
||||||
and check_marlin_supported(
|
|
||||||
GPTQ_TYPE_MAP[(weight_bits, sym)],
|
|
||||||
group_size,
|
|
||||||
has_zp=not sym))
|
|
||||||
else:
|
else:
|
||||||
use_marlin = False
|
use_marlin = False
|
||||||
if use_marlin:
|
if use_marlin:
|
||||||
@ -251,11 +253,11 @@ class AutoRoundConfig(QuantizationConfig):
|
|||||||
from vllm.model_executor.layers.quantization.moe_wna16 import (
|
from vllm.model_executor.layers.quantization.moe_wna16 import (
|
||||||
MoeWNA16Config)
|
MoeWNA16Config)
|
||||||
config = {
|
config = {
|
||||||
"linear_quant_method": "gptq",
|
"quant_method": "gptq",
|
||||||
"weight_bits": weight_bits,
|
"bits": weight_bits,
|
||||||
"group_size": group_size,
|
"group_size": group_size,
|
||||||
"sym": sym,
|
"sym": sym,
|
||||||
"lm_head_quantized": False,
|
"lm_head": False,
|
||||||
}
|
}
|
||||||
return MoeWNA16Config.from_config(config).get_quant_method(
|
return MoeWNA16Config.from_config(config).get_quant_method(
|
||||||
layer, prefix)
|
layer, prefix)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user