# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
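# Smoke tests for loading and executing MoE models quantized to MXFP4 with
# AMD Quark ("amd-quark").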
import importlib.metadata
import importlib.util
from dataclasses import dataclass

import pytest
import torch
from packaging import version
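# Only run when the `quark` package is importable and the installed amd-quark
# is recent enough to support MXFP4.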
QUARK_MXFP4_AVAILABLE = importlib.util.find_spec(
    "quark") is not None and version.parse(
        importlib.metadata.version("amd-quark")) >= version.parse('0.8.99')
@dataclass
class ModelCase:
    model_id: str
    tp: int
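# Each case pairs a small MXFP4-quantized MoE checkpoint with the tensor
# parallel size it is run under.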
@pytest.mark.parametrize('model_case', [
    ModelCase("fxmarty/qwen_1.5-moe-a2.7b-mxfp4", tp=1),
    ModelCase("fxmarty/deepseek_r1_3_layers_mxfp4", tp=8),
    ModelCase("fxmarty/Llama-4-Scout-17B-16E-Instruct-2-layers-mxfp4", tp=1)
])
@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE,
                    reason="amd-quark>=0.9 is not available")
def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase):
    if torch.cuda.device_count() < model_case.tp:
        pytest.skip(f"This test requires >={model_case.tp} gpus, got only "
                    f"{torch.cuda.device_count()}")
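    # load_format="dummy" initializes weights randomly instead of downloading
    # the real checkpoint, keeping the test lightweight while still exercising
    # the MXFP4 quantization setup and MoE execution path.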
    with vllm_runner(model_case.model_id,
                     tensor_parallel_size=model_case.tp,
                     load_format="dummy") as llm:
        # TODO: llm.apply_model(check_model) currently relies on V0 internals.
        # Re-enable this later.
        # def check_model(model):
        #     layer = model.model.layers[0]

        #     qkv_proj = layer.self_attn.qkv_proj

        #     assert isinstance(qkv_proj.quant_method, QuarkLinearMethod)
        #     assert isinstance(qkv_proj.scheme, QuarkW4A4MXFP4)

        #     assert isinstance(layer.mlp.experts.quant_method,
        #                       QuarkW4A4MXFp4MoEMethod)

        # if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4":
        #     llm.apply_model(check_model)
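        # With dummy weights the generated text is meaningless; the assertion
        # only checks that generation runs end to end and returns output.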
        output = llm.generate_greedy("Today I am in the French Alps and",
                                     max_tokens=20)
        assert output