# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test ModelOpt quantization method setup and weight loading.

Run `pytest tests/quantization/test_modelopt.py`.
"""

import os
from typing import NoReturn

import pytest
import torch

from tests.quantization.utils import is_quant_method_supported


@pytest.fixture(scope="function", autouse=True)
def enable_pickle(monkeypatch):
    """`LLM.apply_model` requires pickling a function."""
    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")


def _skip(msg: str) -> NoReturn:
    pytest.skip(msg)
    raise RuntimeError(msg)


def _snapshot_download_or_skip(model_id: str) -> str:
    try:
        from huggingface_hub import snapshot_download
    except Exception as e:  # pragma: no cover
        _skip(f"huggingface_hub is required to download {model_id}: {e}")

    try:
        return snapshot_download(
            repo_id=model_id,
            repo_type="model",
            # These checkpoints are already small; download full repo for simplicity.
            allow_patterns=["*"],
        )
    except Exception as e:
        _skip(f"Failed to download {model_id} from the HF Hub: {e}")


@pytest.mark.skipif(
    not is_quant_method_supported("modelopt"),
    reason="ModelOpt FP8 is not supported on this GPU type.",
)
def test_modelopt_fp8_checkpoint_setup(vllm_runner):
    """Test ModelOpt FP8 checkpoint loading and structure validation."""
    # TODO: provide a small publicly available test checkpoint
    model_path = (
        "/home/scratch.omniml_data_1/zhiyu/ckpts/test_ckpts/"
        "TinyLlama-1.1B-Chat-v1.0-fp8-0710"
    )

    # Skip test if checkpoint doesn't exist
    if not os.path.exists(model_path):
        pytest.skip(
            f"Test checkpoint not found at {model_path}. "
            "This test requires a local ModelOpt FP8 checkpoint."
        )

    with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm:

        def check_model(model):
            layer = model.model.layers[0]

            qkv_proj = layer.self_attn.qkv_proj
            o_proj = layer.self_attn.o_proj
            gate_up_proj = layer.mlp.gate_up_proj
            down_proj = layer.mlp.down_proj

            # Check that ModelOpt quantization method is properly applied
            from vllm.model_executor.layers.quantization.modelopt import (
                ModelOptFp8LinearMethod,
            )

            assert isinstance(qkv_proj.quant_method, ModelOptFp8LinearMethod)
            assert isinstance(o_proj.quant_method, ModelOptFp8LinearMethod)
            assert isinstance(gate_up_proj.quant_method, ModelOptFp8LinearMethod)
            assert isinstance(down_proj.quant_method, ModelOptFp8LinearMethod)

            # Check weight dtype is FP8
            assert qkv_proj.weight.dtype == torch.float8_e4m3fn
            assert o_proj.weight.dtype == torch.float8_e4m3fn
            assert gate_up_proj.weight.dtype == torch.float8_e4m3fn
            assert down_proj.weight.dtype == torch.float8_e4m3fn

            # Check scales are present and have correct dtype
            assert hasattr(qkv_proj, "weight_scale")
            assert hasattr(qkv_proj, "input_scale")
            assert qkv_proj.weight_scale.dtype == torch.float32
            assert qkv_proj.input_scale.dtype == torch.float32

            assert hasattr(o_proj, "weight_scale")
            assert hasattr(o_proj, "input_scale")
            assert o_proj.weight_scale.dtype == torch.float32
            assert o_proj.input_scale.dtype == torch.float32

            assert hasattr(gate_up_proj, "weight_scale")
            assert hasattr(gate_up_proj, "input_scale")
            assert gate_up_proj.weight_scale.dtype == torch.float32
            assert gate_up_proj.input_scale.dtype == torch.float32

            assert hasattr(down_proj, "weight_scale")
            assert hasattr(down_proj, "input_scale")
            assert down_proj.weight_scale.dtype == torch.float32
            assert down_proj.input_scale.dtype == torch.float32

        llm.apply_model(check_model)

        # Run a simple generation test to ensure the model works
        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
        assert output
        print(f"ModelOpt FP8 output: {output}")


@pytest.mark.skipif(
    not is_quant_method_supported("modelopt"),
    reason="ModelOpt FP8 is not supported on this GPU type.",
)
def test_modelopt_fp8_pc_pt_checkpoint_setup(vllm_runner):
    """Test ModelOpt FP8_PER_CHANNEL_PER_TOKEN checkpoint setup."""
    model_id = "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pc-pt"
    model_path = _snapshot_download_or_skip(model_id)

    with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm:

        def check_model(model):
            layer = model.model.layers[0]

            qkv_proj = layer.self_attn.qkv_proj
            o_proj = layer.self_attn.o_proj
            gate_up_proj = layer.mlp.gate_up_proj
            down_proj = layer.mlp.down_proj

            from vllm.model_executor.layers.quantization.modelopt import (
                ModelOptFp8PcPtLinearMethod,
            )

            assert isinstance(qkv_proj.quant_method, ModelOptFp8PcPtLinearMethod)
            assert isinstance(o_proj.quant_method, ModelOptFp8PcPtLinearMethod)
            assert isinstance(gate_up_proj.quant_method, ModelOptFp8PcPtLinearMethod)
            assert isinstance(down_proj.quant_method, ModelOptFp8PcPtLinearMethod)

            assert qkv_proj.weight.dtype == torch.float8_e4m3fn
            assert o_proj.weight.dtype == torch.float8_e4m3fn
            assert gate_up_proj.weight.dtype == torch.float8_e4m3fn
            assert down_proj.weight.dtype == torch.float8_e4m3fn

            # Per-channel scales; activations are dynamically scaled per token.
            assert hasattr(qkv_proj, "weight_scale")
            assert qkv_proj.weight_scale.dtype == torch.float32
            assert qkv_proj.weight_scale.dim() == 1
            assert not hasattr(qkv_proj, "input_scale")

            assert hasattr(o_proj, "weight_scale")
            assert o_proj.weight_scale.dtype == torch.float32
            assert o_proj.weight_scale.dim() == 1
            assert not hasattr(o_proj, "input_scale")

            assert hasattr(gate_up_proj, "weight_scale")
            assert gate_up_proj.weight_scale.dtype == torch.float32
            assert gate_up_proj.weight_scale.dim() == 1
            assert not hasattr(gate_up_proj, "input_scale")

            assert hasattr(down_proj, "weight_scale")
            assert down_proj.weight_scale.dtype == torch.float32
            assert down_proj.weight_scale.dim() == 1
            assert not hasattr(down_proj, "input_scale")

        llm.apply_model(check_model)

        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
        assert output
        print(f"ModelOpt FP8_PER_CHANNEL_PER_TOKEN output: {output}")


@pytest.mark.skipif(
    not is_quant_method_supported("modelopt"),
    reason="ModelOpt FP8 is not supported on this GPU type.",
)
def test_modelopt_fp8_pb_wo_checkpoint_setup(vllm_runner):
    """Test ModelOpt FP8_PB_WO checkpoint setup."""
    model_id = "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pb-wo"
    model_path = _snapshot_download_or_skip(model_id)

    with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm:

        def check_model(model):
            layer = model.model.layers[0]

            qkv_proj = layer.self_attn.qkv_proj
            o_proj = layer.self_attn.o_proj
            gate_up_proj = layer.mlp.gate_up_proj
            down_proj = layer.mlp.down_proj

            from vllm.model_executor.layers.quantization.modelopt import (
                ModelOptFp8PbWoLinearMethod,
            )

            assert isinstance(qkv_proj.quant_method, ModelOptFp8PbWoLinearMethod)
            assert isinstance(o_proj.quant_method, ModelOptFp8PbWoLinearMethod)
            assert isinstance(gate_up_proj.quant_method, ModelOptFp8PbWoLinearMethod)
            assert isinstance(down_proj.quant_method, ModelOptFp8PbWoLinearMethod)

            assert qkv_proj.weight.dtype == torch.float8_e4m3fn
            assert o_proj.weight.dtype == torch.float8_e4m3fn
            assert gate_up_proj.weight.dtype == torch.float8_e4m3fn
            assert down_proj.weight.dtype == torch.float8_e4m3fn

            # Block scales; should be materialized as a 2D [out_blk, in_blk] tensor.
            assert hasattr(qkv_proj, "weight_scale")
            assert qkv_proj.weight_scale.dtype == torch.float32
            assert qkv_proj.weight_scale.dim() == 2

            assert hasattr(o_proj, "weight_scale")
            assert o_proj.weight_scale.dtype == torch.float32
            assert o_proj.weight_scale.dim() == 2

            assert hasattr(gate_up_proj, "weight_scale")
            assert gate_up_proj.weight_scale.dtype == torch.float32
            assert gate_up_proj.weight_scale.dim() == 2

            assert hasattr(down_proj, "weight_scale")
            assert down_proj.weight_scale.dtype == torch.float32
            assert down_proj.weight_scale.dim() == 2

        llm.apply_model(check_model)

        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
        assert output
        print(f"ModelOpt FP8_PB_WO output: {output}")