Re-enable FlashInfer for Llama4 on Blackwell in e2e fusion tests (#28966)
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>

Parent: 0c80efd94f
Commit: 61728cd1df
@@ -930,6 +930,8 @@ steps:
   - csrc/quantization/fp4/
   - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
   - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
   - vllm/compilation/
   # can affect pattern matching
   - vllm/model_executor/layers/layernorm.py
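
The two added paths widen this step's source_file_dependencies, so changes under the v1 worker directory or to the cudagraph dispatcher now trigger the e2e fusion tests. As a rough illustration only (not the actual Buildkite logic), a prefix match over the dependency list is enough to see when the step fires; step_should_run below is a hypothetical helper written for this sketch.

# Illustrative sketch, not vLLM or Buildkite code: approximates how a list of
# changed files could be matched against a step's source_file_dependencies.
def step_should_run(changed_files: list[str], deps: list[str]) -> bool:
    for path in changed_files:
        for dep in deps:
            # Directory dependencies end with "/", file dependencies must match exactly.
            if (dep.endswith("/") and path.startswith(dep)) or path == dep:
                return True
    return False

# With the two paths added in this hunk, a worker-only change now runs the step.
deps = [
    "vllm/v1/attention/backends/flashinfer.py",
    "vllm/v1/worker/",                  # added in this commit
    "vllm/v1/cudagraph_dispatcher.py",  # added in this commit
    "vllm/compilation/",
]
print(step_should_run(["vllm/v1/worker/gpu_model_runner.py"], deps))  # True
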
@@ -47,12 +47,8 @@ if current_platform.is_cuda():
         ModelBackendTestCase(
             # Use smaller model for L40s in CI
             model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
-            # TODO while llama4 is broken, use FLASHINFER for llama3 on Blackwell
-            # so FI attention+fp8_quant is at least tested once
             model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
-            backend=AttentionBackendEnum.FLASHINFER
-            if is_blackwell()
-            else AttentionBackendEnum.TRITON_ATTN,
+            backend=AttentionBackendEnum.TRITON_ATTN,
             matches=Matches(
                 attention_fusion=32,
                 allreduce_fusion=65,
@@ -65,9 +61,9 @@ if current_platform.is_cuda():
             model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
             # TODO FlashInfer attn broken on Hopper with kvcache=fp8:
             # https://github.com/vllm-project/vllm/issues/28568
-            # TODO FlashInfer attn broken on Blackwell for llama4:
-            # https://github.com/vllm-project/vllm/issues/28604
-            backend=AttentionBackendEnum.TRITON_ATTN,
+            backend=AttentionBackendEnum.FLASHINFER
+            if is_blackwell()
+            else AttentionBackendEnum.TRITON_ATTN,
             matches=Matches(
                 attention_fusion=48,
                 allreduce_fusion=96,
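
Taken together, the two test-file hunks move the Blackwell-only FlashInfer branch from the Llama 3.1 8B FP8 case (which now always uses TRITON_ATTN) to the Llama 4 case, since the Blackwell Llama 4 breakage tracked in issue #28604 is resolved; the Hopper fp8-kv-cache limitation (#28568) is still noted. The sketch below restates that selection logic on its own; it is not the real test module, is_blackwell() is a hypothetical stand-in based on CUDA compute capability, and plain strings stand in for the AttentionBackendEnum members shown in the diff.

# Standalone sketch of the backend choice after this commit; names other than
# those visible in the diff are assumptions for illustration.
import torch

def is_blackwell() -> bool:
    # Assumption: treat CUDA compute capability major >= 10 (SM 10.x) as Blackwell.
    return torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 10

# Llama 3.1 8B FP8 case: always Triton attention again.
llama3_backend = "TRITON_ATTN"

# Llama 4 case: FlashInfer re-enabled on Blackwell, Triton attention elsewhere.
llama4_backend = "FLASHINFER" if is_blackwell() else "TRITON_ATTN"

print(llama3_backend, llama4_backend)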