[Bugfix] Remove dead block_quant_to_tensor_quant function (#31294)
Co-authored-by: yurekami <yurekami@users.noreply.github.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
commit 506eb0f454
parent 5d93089686
@@ -15,10 +15,7 @@ from vllm import _custom_ops as ops
 from vllm._aiter_ops import rocm_aiter_ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    GroupShape,
-    group_broadcast,
-)
+from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     CUTLASS_BLOCK_FP8_SUPPORTED,
 )
@@ -463,21 +460,6 @@ def input_to_float8(
     return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()
 
 
-def block_quant_to_tensor_quant(
-    x_q_block: torch.Tensor,
-    x_s: torch.Tensor,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    """This function converts block-wise quantization to tensor-wise
-    quantization. The inputs are block-wise quantization tensor `x_q_block`,
-    block-wise quantization scale and the block size.
-    The outputs are tensor-wise quantization tensor and tensor-wise
-    quantization scale. Note only float8 is supported for now.
-    """
-    x_dq_block = group_broadcast(x_q_block, x_s)
-    x_q_tensor, scale = input_to_float8(x_dq_block, dtype=x_q_block.dtype)
-    return x_q_tensor, scale
-
-
 @triton.jit
 def _per_token_group_quant_fp8(
     # Pointers to inputs and output