diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index d4b6f4077ab3..98daebcc0693 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -930,6 +930,8 @@ steps:
   - csrc/quantization/fp4/
   - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
   - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
   - vllm/compilation/ # can affect pattern matching
   - vllm/model_executor/layers/layernorm.py
diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py
index 2e1b595a4389..661172e1965b 100644
--- a/tests/compile/distributed/test_fusions_e2e.py
+++ b/tests/compile/distributed/test_fusions_e2e.py
@@ -47,12 +47,8 @@ if current_platform.is_cuda():
         ModelBackendTestCase(
             # Use smaller model for L40s in CI
             model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
-            # TODO while llama4 is broken, use FLASHINFER for llama3 on Blackwell
-            # so FI attention+fp8_quant is at least tested once
             model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
-            backend=AttentionBackendEnum.FLASHINFER
-            if is_blackwell()
-            else AttentionBackendEnum.TRITON_ATTN,
+            backend=AttentionBackendEnum.TRITON_ATTN,
             matches=Matches(
                 attention_fusion=32,
                 allreduce_fusion=65,
@@ -65,9 +61,9 @@ if current_platform.is_cuda():
             model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
             # TODO FlashInfer attn broken on Hopper with kvcache=fp8:
             # https://github.com/vllm-project/vllm/issues/28568
-            # TODO FlashInfer attn broken on Blackwell for llama4:
-            # https://github.com/vllm-project/vllm/issues/28604
-            backend=AttentionBackendEnum.TRITON_ATTN,
+            backend=AttentionBackendEnum.FLASHINFER
+            if is_blackwell()
+            else AttentionBackendEnum.TRITON_ATTN,
             matches=Matches(
                 attention_fusion=48,
                 allreduce_fusion=96,