From f8a0acbdbeb7751cf2a25f5d5191cf1ae2ce6e22 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Wed, 15 Oct 2025 23:02:57 -0400
Subject: [PATCH] [CI] Enable Blackwell Llama4 MoE tests (#26731)

Signed-off-by: mgoin
---
 .buildkite/test-pipeline.yaml            |  2 +-
 tests/quantization/test_blackwell_moe.py | 56 +++++++++++++++---------
 2 files changed, 36 insertions(+), 22 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index a8a5bf3ad234..a476b377ba3b 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -529,7 +529,7 @@ steps:
   # we can only upgrade after this is resolved
   # TODO(jerryzh168): resolve the above comment
   - uv pip install --system torchao==0.13.0
-  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
 - label: LM Eval Small Models # 53min
   timeout_in_minutes: 75
diff --git a/tests/quantization/test_blackwell_moe.py b/tests/quantization/test_blackwell_moe.py
index fc7f73e77c31..3773d1f2afa6 100644
--- a/tests/quantization/test_blackwell_moe.py
+++ b/tests/quantization/test_blackwell_moe.py
@@ -3,6 +3,7 @@
 
 import json
 import os
+from typing import Any
 
 import pytest
 
@@ -24,12 +25,21 @@ def set_test_environment():
     os.environ["FLASHINFER_NVCC_THREADS"] = "16"
 
 
-# dummy_hf_overrides = {"num_layers": 4, "num_hidden_layers": 4,
-#     "text_config": {"num_layers": 4, "num_hidden_layers": 4}}
-dummy_hf_overrides = {"num_layers": 4, "num_hidden_layers": 4}
+# Override the backbone layers to 4 for faster startup
+HF_OVERRIDE_TEXT = {
+    "num_layers": 4,
+    "num_hidden_layers": 4,
+}
+HF_OVERRIDE_MM = {
+    "text_config": {"num_layers": 4, "num_hidden_layers": 4},
+}
 
 
-def can_initialize(model: str, extra_args: list[str] | None = None):
+def can_initialize(
+    model: str,
+    hf_overrides: dict[str, Any] | None = None,
+    extra_args: list[str] | None = None,
+):
     # Server arguments
     extra_args = extra_args if extra_args is not None else []
     server_args = [
@@ -50,7 +60,7 @@ def can_initialize(model: str, extra_args: list[str] | None = None):
         model,
         server_args,
         max_wait_seconds=1500,  # Due to FlashInfer compile
-        override_hf_configs=dummy_hf_overrides,
+        override_hf_configs=hf_overrides,
     ) as server:
         client = server.get_client()
         # Make a simple request to verify the server works
@@ -77,28 +87,33 @@ def can_initialize(model: str, extra_args: list[str] | None = None):
 def test_llama4_fp8_tensor_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
-    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8")
+    can_initialize(
+        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", hf_overrides=HF_OVERRIDE_MM
+    )
 
 
-@pytest.mark.skip(reason="Works, but takes too long to run")
 def test_llama4_fp8_tensor_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
-    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8")
+    can_initialize(
+        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", hf_overrides=HF_OVERRIDE_MM
+    )
 
 
-@pytest.mark.skip(reason="Works, but takes too long to run")
 def test_llama4_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
-    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4")
+    can_initialize(
+        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", hf_overrides=HF_OVERRIDE_MM
+    )
 
 
-@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
 def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
-    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4")
+    can_initialize(
+        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", hf_overrides=HF_OVERRIDE_MM
+    )
 
 
 ## DeepSeekV3 ##
@@ -106,7 +121,7 @@ def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
 
 def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1")
-    can_initialize("deepseek-ai/DeepSeek-V3.1")
+    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)
 
 
 @pytest.mark.skip(
@@ -118,26 +133,25 @@ def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
 def test_deepseek_fp8_block_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
-    can_initialize("deepseek-ai/DeepSeek-V3.1")
+    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)
 
 
 def test_deepseek_fp8_block_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
-    can_initialize("deepseek-ai/DeepSeek-V3.1")
+    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)
 
 
 def test_deepseek_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
-    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2")
+    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", hf_overrides=HF_OVERRIDE_TEXT)
 
 
-@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
 def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
     monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
-    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2")
+    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", hf_overrides=HF_OVERRIDE_TEXT)
 
 
 ## GPT-OSS ##
@@ -145,14 +159,14 @@ def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
 
 def test_gptoss_mxfp4bf16_moe_flashinfer(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "1")
-    can_initialize("openai/gpt-oss-20b")
+    can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)
 
 
 def test_gptoss_mxfp4mxfp8_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "1")
-    can_initialize("openai/gpt-oss-20b")
+    can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)
 
 
 def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
-    can_initialize("openai/gpt-oss-20b")
+    can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)