[CI] Enable Blackwell Llama4 MoE tests (#26731)
Signed-off-by: mgoin <mgoin64@gmail.com>
commit f8a0acbdbe (parent 1317034379)
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -529,7 +529,7 @@ steps:
     # we can only upgrade after this is resolved
     # TODO(jerryzh168): resolve the above comment
     - uv pip install --system torchao==0.13.0
-    - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
+    - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
 - label: LM Eval Small Models # 53min
   timeout_in_minutes: 75
--- a/tests/quantization/test_blackwell_moe.py
+++ b/tests/quantization/test_blackwell_moe.py
@@ -3,6 +3,7 @@
 
 import json
 import os
+from typing import Any
 
 import pytest
 
@@ -24,12 +25,21 @@ def set_test_environment():
     os.environ["FLASHINFER_NVCC_THREADS"] = "16"
 
 
-# dummy_hf_overrides = {"num_layers": 4, "num_hidden_layers": 4,
-#                       "text_config": {"num_layers": 4, "num_hidden_layers": 4}}
-dummy_hf_overrides = {"num_layers": 4, "num_hidden_layers": 4}
+# Override the backbone layers to 4 for faster startup
+HF_OVERRIDE_TEXT = {
+    "num_layers": 4,
+    "num_hidden_layers": 4,
+}
+HF_OVERRIDE_MM = {
+    "text_config": {"num_layers": 4, "num_hidden_layers": 4},
+}
 
 
-def can_initialize(model: str, extra_args: list[str] | None = None):
+def can_initialize(
+    model: str,
+    hf_overrides: dict[str, Any] | None = None,
+    extra_args: list[str] | None = None,
+):
     # Server arguments
     extra_args = extra_args if extra_args is not None else []
     server_args = [
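Editor's note: the split into two constants reflects that multimodal models such as Llama 4 nest their language backbone under a "text_config" key, while text-only models keep the layer counts at the top level; trimming either to 4 layers is what makes server startup fast enough for CI. A minimal sketch of how such overrides would merge into an HF-style config dict; the deep_update helper is illustrative, not part of the test file:

from typing import Any

# Values mirror the constants introduced in the diff above.
HF_OVERRIDE_TEXT = {"num_layers": 4, "num_hidden_layers": 4}
HF_OVERRIDE_MM = {"text_config": {"num_layers": 4, "num_hidden_layers": 4}}


def deep_update(config: dict[str, Any], overrides: dict[str, Any]) -> None:
    """Recursively apply override values onto a nested config dict."""
    for key, value in overrides.items():
        if isinstance(value, dict) and isinstance(config.get(key), dict):
            deep_update(config[key], value)
        else:
            config[key] = value


# Text-only models carry layer counts at the top level...
cfg_text = {"num_hidden_layers": 61, "vocab_size": 129280}
deep_update(cfg_text, HF_OVERRIDE_TEXT)
assert cfg_text["num_hidden_layers"] == 4

# ...while multimodal models nest the backbone under "text_config".
cfg_mm = {"text_config": {"num_hidden_layers": 48}, "vision_config": {}}
deep_update(cfg_mm, HF_OVERRIDE_MM)
assert cfg_mm["text_config"]["num_hidden_layers"] == 4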
@@ -50,7 +60,7 @@ def can_initialize(model: str, extra_args: list[str] | None = None):
         model,
         server_args,
         max_wait_seconds=1500,  # Due to FlashInfer compile
-        override_hf_configs=dummy_hf_overrides,
+        override_hf_configs=hf_overrides,
     ) as server:
         client = server.get_client()
         # Make a simple request to verify the server works
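Editor's note: only fragments of can_initialize appear in the hunks. Pieced together from the visible context lines, the helper presumably looks roughly like the sketch below; everything not shown above (the contents of server_args, the verification request) is an assumption, and RemoteOpenAIServer is vLLM's existing test-server helper:

from typing import Any

from tests.utils import RemoteOpenAIServer  # vLLM test helper (existing)


def can_initialize(
    model: str,
    hf_overrides: dict[str, Any] | None = None,
    extra_args: list[str] | None = None,
):
    # Server arguments
    extra_args = extra_args if extra_args is not None else []
    server_args = [
        "--max-model-len", "1024",  # assumed; not visible in the hunks
        *extra_args,
    ]
    # Launch the OpenAI-compatible server and confirm it answers a request.
    with RemoteOpenAIServer(
        model,
        server_args,
        max_wait_seconds=1500,  # Due to FlashInfer compile
        override_hf_configs=hf_overrides,
    ) as server:
        client = server.get_client()
        # Make a simple request to verify the server works
        completion = client.completions.create(
            model=model, prompt="Hello, my name is", max_tokens=8
        )
        assert completion.choices[0].text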
@@ -77,28 +87,33 @@ def can_initialize(model: str, extra_args: list[str] | None = None):
 def test_llama4_fp8_tensor_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
-    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8")
+    can_initialize(
+        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", hf_overrides=HF_OVERRIDE_MM
+    )
 
 
-@pytest.mark.skip(reason="Works, but takes too long to run")
 def test_llama4_fp8_tensor_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
-    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8")
+    can_initialize(
+        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", hf_overrides=HF_OVERRIDE_MM
+    )
 
 
-@pytest.mark.skip(reason="Works, but takes too long to run")
 def test_llama4_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
-    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4")
+    can_initialize(
+        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", hf_overrides=HF_OVERRIDE_MM
+    )
 
 
-@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
 def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
-    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4")
+    can_initialize(
+        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", hf_overrides=HF_OVERRIDE_MM
+    )
 
 
 ## DeepSeekV3 ##
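Editor's note: for reference, the effect of override_hf_configs is roughly equivalent to launching the server with the overrides serialized onto vLLM's --hf-overrides flag. A hedged sketch of the standalone command; the exact wiring inside the test helper may differ:

import json

# The MM override shrinks Llama 4's language backbone to 4 layers.
overrides = {"text_config": {"num_layers": 4, "num_hidden_layers": 4}}
cmd = [
    "vllm", "serve", "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
    "--hf-overrides", json.dumps(overrides),
]
print(" ".join(cmd))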
@@ -106,7 +121,7 @@ def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
 
 def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1")
-    can_initialize("deepseek-ai/DeepSeek-V3.1")
+    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)
 
 
 @pytest.mark.skip(
@@ -118,26 +133,25 @@ def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
 def test_deepseek_fp8_block_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
-    can_initialize("deepseek-ai/DeepSeek-V3.1")
+    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)
 
 
 def test_deepseek_fp8_block_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
-    can_initialize("deepseek-ai/DeepSeek-V3.1")
+    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)
 
 
 def test_deepseek_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
-    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2")
+    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", hf_overrides=HF_OVERRIDE_TEXT)
 
 
-@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
 def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
-    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2")
+    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", hf_overrides=HF_OVERRIDE_TEXT)
 
 
 ## GPT-OSS ##
@@ -145,14 +159,14 @@ def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
 
 def test_gptoss_mxfp4bf16_moe_flashinfer(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "1")
-    can_initialize("openai/gpt-oss-20b")
+    can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)
 
 
 def test_gptoss_mxfp4mxfp8_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "1")
-    can_initialize("openai/gpt-oss-20b")
+    can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)
 
 
 def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
-    can_initialize("openai/gpt-oss-20b")
+    can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)
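Editor's note: since each test sets its own VLLM_* toggles through monkeypatch, reproducing a single entry locally only needs the pytest node ID. A sketch of that invocation; the repo-relative path is an assumption, and the run requires Blackwell hardware plus downloadable weights:

import subprocess

# Run one Blackwell MoE test by node ID; the test configures its own
# environment, so nothing extra is exported here.
subprocess.run(
    [
        "pytest", "-v", "-s",
        "tests/quantization/test_blackwell_moe.py"
        "::test_gptoss_mxfp4bf16_moe_flashinfer",
    ],
    check=True,
)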