diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
index d42ae22c5139..5ad1b15b7ed5 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -66,6 +66,7 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):
         output_size_per_partition = sum(output_partition_sizes)
         layer.logical_widths = output_partition_sizes
         layer.weight_block_size = None
+        layer.orig_dtype = params_dtype

         if self.strategy == QuantizationStrategy.BLOCK:
             assert self.weight_block_size is not None
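
The diff records the pre-quantization parameter dtype on the layer as `layer.orig_dtype`. The snippet below is a minimal sketch, not part of this diff, of the usual reason a quantization scheme stores that value: an FP8 GEMM typically accumulates in a wider dtype, and the stored dtype lets the scheme cast outputs back to match the unquantized parts of the model. The `_SketchLayer` class and `apply_fp8_sketch` function are hypothetical stand-ins; the actual consumer of `orig_dtype` in vLLM may differ.

import torch


class _SketchLayer:
    """Hypothetical stand-in for a linear layer set up by create_weights()."""

    def __init__(self, params_dtype: torch.dtype):
        # Mirrors the added line: remember the model's original dtype so the
        # quantized path can restore it on its outputs.
        self.orig_dtype = params_dtype


def apply_fp8_sketch(layer: _SketchLayer, acc: torch.Tensor) -> torch.Tensor:
    # Assumption for illustration: the FP8 matmul produced `acc` in a wider
    # accumulation dtype (e.g. float32); cast back to the recorded dtype so
    # downstream ops see the dtype the rest of the model uses.
    return acc.to(layer.orig_dtype)


if __name__ == "__main__":
    layer = _SketchLayer(params_dtype=torch.bfloat16)
    out = apply_fp8_sketch(layer, torch.randn(4, 4, dtype=torch.float32))
    assert out.dtype == torch.bfloat16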