mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 00:15:51 +08:00
[Misc] add ignore mapper for quark quantization (#28275)
Signed-off-by: Haoyang Li <lihaoyang0109@gmail.com>
This commit is contained in:
parent
0aecd9138f
commit
0b25498990
@ -2,7 +2,7 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import fnmatch
|
import fnmatch
|
||||||
from typing import Any, Optional, cast
|
from typing import TYPE_CHECKING, Any, Optional, cast
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@ -34,6 +34,9 @@ from vllm.model_executor.layers.quantization.quark.utils import (
|
|||||||
)
|
)
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from vllm.model_executor.models.utils import WeightsMapper
|
||||||
|
|
||||||
__all__ = ["QuarkLinearMethod"]
|
__all__ = ["QuarkLinearMethod"]
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
@ -54,6 +57,7 @@ class QuarkConfig(QuantizationConfig):
|
|||||||
self.kv_cache_group = kv_cache_group
|
self.kv_cache_group = kv_cache_group
|
||||||
self.kv_cache_config = kv_cache_config
|
self.kv_cache_config = kv_cache_config
|
||||||
self.pack_method = pack_method
|
self.pack_method = pack_method
|
||||||
|
self.ignore: list[str] = cast(list[str], self.quant_config.get("exclude", []))
|
||||||
|
|
||||||
def get_linear_method(self) -> "QuarkLinearMethod":
|
def get_linear_method(self) -> "QuarkLinearMethod":
|
||||||
return QuarkLinearMethod(self)
|
return QuarkLinearMethod(self)
|
||||||
@ -74,9 +78,8 @@ class QuarkConfig(QuantizationConfig):
|
|||||||
from vllm.attention.layer import Attention # Avoid circular import
|
from vllm.attention.layer import Attention # Avoid circular import
|
||||||
|
|
||||||
# Check if the layer is skipped for quantization.
|
# Check if the layer is skipped for quantization.
|
||||||
exclude_layers = cast(list[str], self.quant_config.get("exclude"))
|
|
||||||
if should_ignore_layer(
|
if should_ignore_layer(
|
||||||
prefix, ignore=exclude_layers, fused_mapping=self.packed_modules_mapping
|
prefix, ignore=self.ignore, fused_mapping=self.packed_modules_mapping
|
||||||
):
|
):
|
||||||
return UnquantizedLinearMethod()
|
return UnquantizedLinearMethod()
|
||||||
if isinstance(layer, LinearBase):
|
if isinstance(layer, LinearBase):
|
||||||
@ -90,6 +93,9 @@ class QuarkConfig(QuantizationConfig):
|
|||||||
return QuarkMoEMethod.get_moe_method(self, module=layer, layer_name=prefix)
|
return QuarkMoEMethod.get_moe_method(self, module=layer, layer_name=prefix)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
|
||||||
|
self.ignore = hf_to_vllm_mapper.apply_list(self.ignore)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_config(cls, config: dict[str, Any]) -> "QuarkConfig":
|
def from_config(cls, config: dict[str, Any]) -> "QuarkConfig":
|
||||||
export_config = config.get("export")
|
export_config = config.get("export")
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user