mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-11 09:06:01 +08:00
[Quantization] Add compressed-tensors emulations support for NVFP4 (#19879)
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com> Signed-off-by: Dipika <dipikasikka1@gmail.com>
This commit is contained in:
parent
e795d723ed
commit
02c97d9a92
@ -133,6 +133,7 @@ if TYPE_CHECKING:
|
|||||||
VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: int = 300
|
VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: int = 300
|
||||||
VLLM_KV_CACHE_LAYOUT: Optional[str] = None
|
VLLM_KV_CACHE_LAYOUT: Optional[str] = None
|
||||||
VLLM_COMPUTE_NANS_IN_LOGITS: bool = False
|
VLLM_COMPUTE_NANS_IN_LOGITS: bool = False
|
||||||
|
VLLM_USE_NVFP4_CT_EMULATIONS: bool = False
|
||||||
|
|
||||||
|
|
||||||
def get_default_cache_root():
|
def get_default_cache_root():
|
||||||
@ -918,6 +919,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
# or bad hardware but it may add compute overhead.
|
# or bad hardware but it may add compute overhead.
|
||||||
"VLLM_COMPUTE_NANS_IN_LOGITS":
|
"VLLM_COMPUTE_NANS_IN_LOGITS":
|
||||||
lambda: bool(int(os.getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))),
|
lambda: bool(int(os.getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))),
|
||||||
|
|
||||||
|
# Controls whether or not emulations are used for NVFP4
|
||||||
|
# generations on machines with GPU compute capability < 100 for compressed-tensors
|
||||||
|
# models
|
||||||
|
"VLLM_USE_NVFP4_CT_EMULATIONS":
|
||||||
|
lambda: bool(int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0")))
|
||||||
}
|
}
|
||||||
|
|
||||||
# --8<-- [end:env-vars-definition]
|
# --8<-- [end:env-vars-definition]
|
||||||
|
|||||||
@ -13,6 +13,7 @@ from compressed_tensors.quantization import (QuantizationArgs,
|
|||||||
QuantizationType)
|
QuantizationType)
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
import vllm.envs as envs
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor.layers.fused_moe import FusedMoE
|
from vllm.model_executor.layers.fused_moe import FusedMoE
|
||||||
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
|
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
|
||||||
@ -374,7 +375,8 @@ class CompressedTensorsConfig(QuantizationConfig):
|
|||||||
|
|
||||||
if is_activation_quantization_format(self.quant_format):
|
if is_activation_quantization_format(self.quant_format):
|
||||||
if self._is_fp4a4_nvfp4(weight_quant, input_quant):
|
if self._is_fp4a4_nvfp4(weight_quant, input_quant):
|
||||||
if CompressedTensorsW4A4Fp4.cutlass_fp4_supported():
|
if CompressedTensorsW4A4Fp4.cutlass_fp4_supported(
|
||||||
|
) or envs.VLLM_USE_NVFP4_CT_EMULATIONS:
|
||||||
return CompressedTensorsW4A4Fp4()
|
return CompressedTensorsW4A4Fp4()
|
||||||
else:
|
else:
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
|
|||||||
@ -4,11 +4,14 @@ from typing import Callable, Optional
|
|||||||
import torch
|
import torch
|
||||||
from torch.nn.parameter import Parameter
|
from torch.nn.parameter import Parameter
|
||||||
|
|
||||||
|
import vllm.envs as envs
|
||||||
from vllm._custom_ops import (cutlass_scaled_fp4_mm,
|
from vllm._custom_ops import (cutlass_scaled_fp4_mm,
|
||||||
cutlass_scaled_mm_supports_fp4, scaled_fp4_quant)
|
cutlass_scaled_mm_supports_fp4, scaled_fp4_quant)
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
|
from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
|
||||||
CompressedTensorsScheme)
|
CompressedTensorsScheme)
|
||||||
|
from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import ( # noqa: E501
|
||||||
|
run_nvfp4_emulations)
|
||||||
from vllm.model_executor.parameter import (GroupQuantScaleParameter,
|
from vllm.model_executor.parameter import (GroupQuantScaleParameter,
|
||||||
ModelWeightParameter,
|
ModelWeightParameter,
|
||||||
PerTensorScaleParameter)
|
PerTensorScaleParameter)
|
||||||
@ -26,6 +29,8 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_min_capability(cls) -> int:
|
def get_min_capability(cls) -> int:
|
||||||
|
if envs.VLLM_USE_NVFP4_CT_EMULATIONS:
|
||||||
|
return 80
|
||||||
return 100
|
return 100
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@ -129,6 +134,17 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
|
|||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
|
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
|
||||||
|
|
||||||
|
if envs.VLLM_USE_NVFP4_CT_EMULATIONS:
|
||||||
|
out = run_nvfp4_emulations(
|
||||||
|
x=x,
|
||||||
|
input_global_scale=layer.input_global_scale,
|
||||||
|
weight=layer.weight,
|
||||||
|
weight_scale_swizzled=layer.weight_scale_swizzled,
|
||||||
|
weight_global_scale=layer.weight_global_scale)
|
||||||
|
if bias is not None:
|
||||||
|
out = out + bias
|
||||||
|
return out
|
||||||
|
|
||||||
output_dtype = x.dtype
|
output_dtype = x.dtype
|
||||||
output_shape = [x.shape[0], layer.weight.shape[0]]
|
output_shape = [x.shape[0], layer.weight.shape[0]]
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user