From d328f7894f140fdc643dc1aa5fe80f4596e6f418 Mon Sep 17 00:00:00 2001
From: Thomas Parnell
Date: Tue, 2 Sep 2025 22:15:06 +0200
Subject: [PATCH] [CI] Enable all hf transformers baselines in test_hybrid
 (#23936)

Signed-off-by: Thomas Parnell
---
 .../models/language/generation/test_hybrid.py | 76 ++++++-------------
 tests/models/registry.py                      | 13 +++-
 2 files changed, 32 insertions(+), 57 deletions(-)

diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py
index 3cacbdcfbe86..9e97e3fa6577 100644
--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@@ -34,17 +34,6 @@ HYBRID_MODELS = [
     "LiquidAI/LFM2-1.2B",
 ]
 
-HF_UNSUPPORTED_MODELS = [
-    # The HF transformers implementation of
-    # Mamba2 is buggy for Codestral as it doesn't handle n_groups, so the test
-    # doesn't compare vLLM output with HF output.
-    # See https://github.com/huggingface/transformers/pull/35943
-    "yujiepan/mamba2-codestral-v0.1-tiny-random",
-    # transformers 4.55 is still producing garbage for this model
-    # TODO(tdoublep): follow-up on transformers side
-    "ibm-granite/granite-4.0-tiny-preview"
-]
-
 V1_SUPPORTED_MODELS = [
     "state-spaces/mamba-130m-hf",
     "ai21labs/Jamba-tiny-dev",
@@ -90,20 +79,13 @@ def test_models(
     try:
         model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
         model_info.check_available_online(on_fail="skip")
-        hf_version_check = model_info.check_transformers_version(
-            on_fail="return")
+        model_info.check_transformers_version(on_fail="skip")
     except ValueError:
-        hf_version_check = None
-
-    if hf_version_check is not None:
-        print(f"Skipping transformers comparison because: {hf_version_check}")
+        pass
 
     with hf_runner(model) as hf_model:
-        if model not in HF_UNSUPPORTED_MODELS and hf_version_check is None:
-            hf_outputs = hf_model.generate_greedy_logprobs_limit(
-                example_prompts, max_tokens, num_logprobs)
-        else:
-            hf_outputs = None
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
 
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "0")
@@ -121,7 +103,7 @@ def test_models(
     else:
         vllm_v1_outputs = None
 
-    if hf_outputs is not None and vllm_v0_outputs is not None:
+    if vllm_v0_outputs is not None:
         check_logprobs_close(
             outputs_0_lst=hf_outputs,
             outputs_1_lst=vllm_v0_outputs,
@@ -130,12 +112,10 @@ def test_models(
         )
 
     if model in V1_SUPPORTED_MODELS:
-        ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
-        assert ref_outputs is not None
         check_logprobs_close(
-            outputs_0_lst=ref_outputs,
+            outputs_0_lst=hf_outputs,
             outputs_1_lst=vllm_v1_outputs,
-            name_0="hf" if hf_outputs is not None else "vllm-v0",
+            name_0="hf",
             name_1="vllm-v1",
         )
 
@@ -402,11 +382,8 @@ def test_full_cuda_graph(
         pass
 
     with hf_runner(model) as hf_model:
-        if model not in HF_UNSUPPORTED_MODELS:
-            hf_outputs = hf_model.generate_greedy_logprobs_limit(
-                example_prompts, max_tokens, num_logprobs)
-        else:
-            hf_outputs = None
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
 
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "0")
@@ -421,7 +398,7 @@ def test_full_cuda_graph(
             vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
                 example_prompts, max_tokens, num_logprobs)
 
-    if hf_outputs is not None and vllm_v0_outputs is not None:
+    if vllm_v0_outputs is not None:
         check_logprobs_close(
             outputs_0_lst=hf_outputs,
             outputs_1_lst=vllm_v0_outputs,
@@ -429,12 +406,10 @@ def test_full_cuda_graph(
             name_1="vllm-v0",
         )
 
-    ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
-    assert ref_outputs is not None
     check_logprobs_close(
-        outputs_0_lst=ref_outputs,
+        outputs_0_lst=hf_outputs,
         outputs_1_lst=vllm_v1_outputs,
-        name_0="hf" if hf_outputs is not None else "vllm-v0",
+        name_0="hf",
         name_1="vllm-v1",
     )
 
@@ -460,11 +435,8 @@ def test_fp32_state(
         pass
 
     with hf_runner(model) as hf_model:
-        if model not in HF_UNSUPPORTED_MODELS:
-            hf_outputs = hf_model.generate_greedy_logprobs_limit(
-                example_prompts, max_tokens, num_logprobs)
-        else:
-            hf_outputs = None
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
 
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "0")
@@ -480,18 +452,16 @@ def test_fp32_state(
             vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
                 example_prompts, max_tokens, num_logprobs)
 
-    if hf_outputs is not None:
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_v0_outputs,
-            name_0="hf",
-            name_1="vllm-v0",
-        )
-
-    ref_outputs = hf_outputs if hf_outputs is not None else vllm_v0_outputs
     check_logprobs_close(
-        outputs_0_lst=ref_outputs,
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_v0_outputs,
+        name_0="hf",
+        name_1="vllm-v0",
+    )
+
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
         outputs_1_lst=vllm_v1_outputs,
-        name_0="hf" if hf_outputs is not None else "vllm-v0",
+        name_0="hf",
         name_1="vllm-v1",
     )
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 3b5cec2dc702..4cf3dd6e08ce 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -154,7 +154,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "BailingMoeForCausalLM": _HfExamplesInfo("inclusionAI/Ling-lite-1.5",
                                              trust_remote_code=True),
     "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B-v1",
-                                        min_transformers_version="4.56.0",
+                                        min_transformers_version="4.55.3",
                                         extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"}),  # noqa: E501
     "BloomForCausalLM": _HfExamplesInfo("bigscience/bloom-560m",
                                         {"1b": "bigscience/bloomz-1b1"}),
@@ -208,7 +208,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "GptOssForCausalLM": _HfExamplesInfo("lmsys/gpt-oss-20b-bf16"),
     "GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"),
     "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"),
-    "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview"),  # noqa: E501
+    "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview",  # noqa: E501
+                                                   min_transformers_version="4.55.3"),
     "GraniteMoeSharedForCausalLM": _HfExamplesInfo("ibm-research/moe-7b-1b-active-shared-experts"),  # noqa: E501
     "Grok1ModelForCausalLM": _HfExamplesInfo("hpcai-tech/grok-1",
                                              trust_remote_code=True),
@@ -228,7 +229,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                            trust_remote_code=True),
     "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
     "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini",
-                                        min_transformers_version="4.56.0",
+                                        min_transformers_version="4.55.3",
                                         extras={
                                             "tiny": "ai21labs/Jamba-tiny-dev",
                                             "random": "ai21labs/Jamba-tiny-random",  # noqa: E501
@@ -244,7 +245,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "Llama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct",  # noqa: E501
                                          is_available_online=False),
     "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
-    "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1"),
+    "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1",
+                                         min_transformers_version="4.55.3",
+                                         extras={
+                                             "random": "yujiepan/mamba2-codestral-v0.1-tiny-random",  # noqa: E501
+                                         }),
     "FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"),  # noqa: E501
     "MiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-2B-sft-bf16",
                                           trust_remote_code=True),