mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-11 00:15:01 +08:00
Signed-off-by: zhewenli <zhewenli@meta.com>
This commit is contained in:
parent
e42bd8c2e3
commit
f8b19c0ffd
@ -1068,7 +1068,7 @@ steps:
|
|||||||
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||||
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||||
# Wrap with quotes to escape yaml
|
# Wrap with quotes to escape yaml
|
||||||
- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
|
- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
|
||||||
|
|
||||||
- label: Blackwell Fusion E2E Tests # 30 min
|
- label: Blackwell Fusion E2E Tests # 30 min
|
||||||
@ -1095,10 +1095,11 @@ steps:
|
|||||||
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||||
- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
|
- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
|
||||||
|
|
||||||
- label: Blackwell GPT-OSS Eval
|
- label: ROCm GPT-OSS Eval
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
gpu: b200
|
agent_pool: mi325_1
|
||||||
|
mirror_hardwares: [amdproduction]
|
||||||
optional: true # run on nightlies
|
optional: true # run on nightlies
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- tests/evals/gpt_oss
|
- tests/evals/gpt_oss
|
||||||
@ -1107,7 +1108,7 @@ steps:
|
|||||||
- vllm/v1/attention/backends/flashinfer.py
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
commands:
|
commands:
|
||||||
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
- uv pip install --system 'gpt-oss[eval]==0.0.5'
|
||||||
- pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
- VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
||||||
|
|
||||||
- label: Blackwell Quantized MoE Test
|
- label: Blackwell Quantized MoE Test
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
|
|||||||
@ -755,8 +755,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
|
|||||||
|
|
||||||
self.w13_weight = w13_weight
|
self.w13_weight = w13_weight
|
||||||
self.w2_weight = w2_weight
|
self.w2_weight = w2_weight
|
||||||
layer.w13_weight = Parameter(w13_weight.data, requires_grad=False)
|
layer.w13_weight = Parameter(w13_weight.storage.data, requires_grad=False)
|
||||||
layer.w2_weight = Parameter(w2_weight.data, requires_grad=False)
|
layer.w2_weight = Parameter(w2_weight.storage.data, requires_grad=False)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported backend: {self.mxfp4_backend}")
|
raise ValueError(f"Unsupported backend: {self.mxfp4_backend}")
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user